diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -24081,8 +24081,9 @@
 const TargetLowering &TLI, SDValue &SV0, SDValue &SV1,
 SmallVectorImpl<int> &Mask) -> bool {
 // Don't try to fold splats; they're likely to simplify somehow, or they
- // might be free.
+ // might be free. But only if there are at least two non-undef indices.
- if (OtherSVN->isSplat())
+ if (OtherSVN->isSplat() &&
+ count(OtherSVN->getMask(), OtherSVN->getSplatIndex()) > 1)
 return false;
 
 SV0 = SV1 = SDValue();
diff --git a/llvm/test/CodeGen/AArch64/shuffles.ll b/llvm/test/CodeGen/AArch64/shuffles.ll
--- a/llvm/test/CodeGen/AArch64/shuffles.ll
+++ b/llvm/test/CodeGen/AArch64/shuffles.ll
@@ -4,17 +4,16 @@
 define <16 x i32> @test_shuf1(<16 x i32> %x, <16 x i32> %y) {
 ; CHECK-LABEL: test_shuf1:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ext v16.16b, v6.16b, v1.16b, #4
-; CHECK-NEXT: dup v5.4s, v4.s[0]
-; CHECK-NEXT: uzp1 v17.4s, v1.4s, v0.4s
-; CHECK-NEXT: uzp2 v18.4s, v2.4s, v4.4s
+; CHECK-NEXT: ext v5.16b, v6.16b, v1.16b, #4
+; CHECK-NEXT: uzp1 v16.4s, v1.4s, v0.4s
+; CHECK-NEXT: uzp2 v17.4s, v2.4s, v4.4s
+; CHECK-NEXT: ext v6.16b, v6.16b, v4.16b, #12
+; CHECK-NEXT: trn2 v4.4s, v1.4s, v5.4s
 ; CHECK-NEXT: rev64 v3.4s, v7.4s
-; CHECK-NEXT: trn2 v4.4s, v1.4s, v16.4s
-; CHECK-NEXT: mov v5.s[0], v6.s[3]
-; CHECK-NEXT: trn2 v1.4s, v17.4s, v1.4s
-; CHECK-NEXT: trn1 v2.4s, v18.4s, v2.4s
+; CHECK-NEXT: trn2 v1.4s, v16.4s, v1.4s
+; CHECK-NEXT: trn1 v2.4s, v17.4s, v2.4s
 ; CHECK-NEXT: mov v4.s[0], v7.s[1]
-; CHECK-NEXT: mov v3.d[0], v5.d[0]
+; CHECK-NEXT: mov v3.d[0], v6.d[0]
 ; CHECK-NEXT: ext v1.16b, v0.16b, v1.16b, #12
 ; CHECK-NEXT: mov v2.s[3], v7.s[0]
 ; CHECK-NEXT: mov v0.16b, v4.16b
diff --git a/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll
--- a/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll
+++ b/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll
@@ -242,103 +242,79 @@
 ; CHECK-LE-P8-LABEL: test_none_v4i32:
 ; CHECK-LE-P8: # %bb.0: # %entry
 ; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI2_0@toc@ha
-; CHECK-LE-P8-NEXT: mtvsrd v3, r5
+; CHECK-LE-P8-NEXT: lxsiwzx v2, 0, r3
+; CHECK-LE-P8-NEXT: mtvsrd v4, r5
 ; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI2_0@toc@l
 ; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4
-; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI2_1@toc@ha
-; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI2_1@toc@l
-; CHECK-LE-P8-NEXT: xxswapd v2, vs0
-; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4
-; CHECK-LE-P8-NEXT: vperm v2, v3, v3, v2
-; CHECK-LE-P8-NEXT: lxsiwzx v3, 0, r3
-; CHECK-LE-P8-NEXT: xxswapd v4, vs0
-; CHECK-LE-P8-NEXT: vperm v2, v2, v3, v4
+; CHECK-LE-P8-NEXT: xxswapd v3, vs0
+; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3
 ; CHECK-LE-P8-NEXT: xxswapd vs0, v2
 ; CHECK-LE-P8-NEXT: stfdx f0, 0, r3
 ; CHECK-LE-P8-NEXT: blr
 ;
 ; CHECK-LE-P9-LABEL: test_none_v4i32:
 ; CHECK-LE-P9: # %bb.0: # %entry
-; CHECK-LE-P9-NEXT: lxsiwzx v2, 0, r3
+; CHECK-LE-P9-NEXT: lfiwzx f0, 0, r3
 ; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI2_0@toc@ha
-; CHECK-LE-P9-NEXT: mtvsrd v3, r5
+; CHECK-LE-P9-NEXT: mtfprd f1, r5
 ; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI2_0@toc@l
-; CHECK-LE-P9-NEXT: lxv vs0, 0(r3)
-; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI2_1@toc@ha
-; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI2_1@toc@l
-; CHECK-LE-P9-NEXT: lxv v4, 0(r3)
-; CHECK-LE-P9-NEXT: xxperm v3, v3, vs0
-; CHECK-LE-P9-NEXT: vperm v2, v3, v2, v4
-; CHECK-LE-P9-NEXT: xxswapd vs0, v2
+; 
CHECK-LE-P9-NEXT: lxv vs2, 0(r3) +; CHECK-LE-P9-NEXT: xxperm vs0, vs1, vs2 +; CHECK-LE-P9-NEXT: xxswapd vs0, vs0 ; CHECK-LE-P9-NEXT: stfd f0, 0(r3) ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_none_v4i32: ; CHECK-BE-P8: # %bb.0: # %entry ; CHECK-BE-P8-NEXT: addis r4, r2, .LCPI2_0@toc@ha -; CHECK-BE-P8-NEXT: mtvsrwz v3, r5 +; CHECK-BE-P8-NEXT: lxsiwzx v2, 0, r3 +; CHECK-BE-P8-NEXT: mtvsrwz v4, r5 ; CHECK-BE-P8-NEXT: addi r4, r4, .LCPI2_0@toc@l -; CHECK-BE-P8-NEXT: lxvw4x v2, 0, r4 -; CHECK-BE-P8-NEXT: addis r4, r2, .LCPI2_1@toc@ha -; CHECK-BE-P8-NEXT: addi r4, r4, .LCPI2_1@toc@l -; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r4 -; CHECK-BE-P8-NEXT: vperm v2, v3, v3, v2 -; CHECK-BE-P8-NEXT: lxsiwzx v3, 0, r3 -; CHECK-BE-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-BE-P8-NEXT: lxvw4x v3, 0, r4 +; CHECK-BE-P8-NEXT: vperm v2, v2, v4, v3 ; CHECK-BE-P8-NEXT: stxsdx v2, 0, r3 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_none_v4i32: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: lxsiwzx v2, 0, r3 +; CHECK-BE-P9-NEXT: lfiwzx f0, 0, r3 ; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI2_0@toc@ha -; CHECK-BE-P9-NEXT: mtvsrwz v3, r5 +; CHECK-BE-P9-NEXT: mtfprwz f1, r5 ; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI2_0@toc@l -; CHECK-BE-P9-NEXT: lxv vs0, 0(r3) -; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI2_1@toc@ha -; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI2_1@toc@l -; CHECK-BE-P9-NEXT: lxv v4, 0(r3) -; CHECK-BE-P9-NEXT: xxperm v3, v3, vs0 -; CHECK-BE-P9-NEXT: vperm v2, v2, v3, v4 -; CHECK-BE-P9-NEXT: stxsd v2, 0(r3) +; CHECK-BE-P9-NEXT: lxv vs2, 0(r3) +; CHECK-BE-P9-NEXT: xxperm vs0, vs1, vs2 +; CHECK-BE-P9-NEXT: stfd f0, 0(r3) ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_none_v4i32: ; CHECK-AIX-64-P8: # %bb.0: # %entry ; CHECK-AIX-64-P8-NEXT: ld r4, L..C3(r2) # %const.0 -; CHECK-AIX-64-P8-NEXT: mtvsrwz v3, r5 -; CHECK-AIX-64-P8-NEXT: lxvw4x v2, 0, r4 -; CHECK-AIX-64-P8-NEXT: ld r4, L..C4(r2) # %const.1 -; CHECK-AIX-64-P8-NEXT: vperm v2, v3, v3, v2 -; CHECK-AIX-64-P8-NEXT: lxsiwzx v3, 0, r3 -; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r4 -; CHECK-AIX-64-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-AIX-64-P8-NEXT: lxsiwzx v2, 0, r3 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r5 +; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r4 +; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v4, v3 ; CHECK-AIX-64-P8-NEXT: stxsdx v2, 0, r3 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_none_v4i32: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: lxsiwzx v2, 0, r3 +; CHECK-AIX-64-P9-NEXT: lfiwzx f0, 0, r3 ; CHECK-AIX-64-P9-NEXT: ld r3, L..C2(r2) # %const.0 -; CHECK-AIX-64-P9-NEXT: mtvsrwz v3, r5 -; CHECK-AIX-64-P9-NEXT: lxv vs0, 0(r3) -; CHECK-AIX-64-P9-NEXT: ld r3, L..C3(r2) # %const.1 -; CHECK-AIX-64-P9-NEXT: lxv v4, 0(r3) -; CHECK-AIX-64-P9-NEXT: xxperm v3, v3, vs0 -; CHECK-AIX-64-P9-NEXT: vperm v2, v2, v3, v4 -; CHECK-AIX-64-P9-NEXT: stxsd v2, 0(r3) +; CHECK-AIX-64-P9-NEXT: mtfprwz f1, r5 +; CHECK-AIX-64-P9-NEXT: lxv vs2, 0(r3) +; CHECK-AIX-64-P9-NEXT: xxperm vs0, vs1, vs2 +; CHECK-AIX-64-P9-NEXT: stfd f0, 0(r3) ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_none_v4i32: ; CHECK-AIX-32-P8: # %bb.0: # %entry -; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: lwz r4, L..C3(r2) # %const.0 ; CHECK-AIX-32-P8-NEXT: lxsiwzx v2, 0, r3 ; CHECK-AIX-32-P8-NEXT: stb r5, -32(r1) -; CHECK-AIX-32-P8-NEXT: lwz r3, L..C3(r2) # %const.0 -; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r4 -; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 ; CHECK-AIX-32-P8-NEXT: addi 
r3, r1, -16 -; CHECK-AIX-32-P8-NEXT: vmrghh v3, v3, v3 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r4 ; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, v4 ; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r3 ; CHECK-AIX-32-P8-NEXT: lwz r3, -12(r1) @@ -349,14 +325,13 @@ ; ; CHECK-AIX-32-P9-LABEL: test_none_v4i32: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lxsiwzx v2, 0, r3 +; CHECK-AIX-32-P9-NEXT: lfiwzx f0, 0, r3 ; CHECK-AIX-32-P9-NEXT: lwz r3, L..C2(r2) # %const.0 ; CHECK-AIX-32-P9-NEXT: stb r5, -32(r1) -; CHECK-AIX-32-P9-NEXT: lxv v3, -32(r1) -; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r3) -; CHECK-AIX-32-P9-NEXT: vmrghh v3, v3, v3 -; CHECK-AIX-32-P9-NEXT: vperm v2, v2, v3, v4 -; CHECK-AIX-32-P9-NEXT: stxv v2, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs2, 0(r3) +; CHECK-AIX-32-P9-NEXT: xxperm vs1, vs0, vs2 +; CHECK-AIX-32-P9-NEXT: stxv vs1, -16(r1) ; CHECK-AIX-32-P9-NEXT: lwz r3, -12(r1) ; CHECK-AIX-32-P9-NEXT: stw r3, 0(r3) ; CHECK-AIX-32-P9-NEXT: lwz r3, -16(r1) @@ -420,7 +395,7 @@ ; ; CHECK-AIX-64-P8-LABEL: test_v4i32_none: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: ld r4, L..C5(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: ld r4, L..C4(r2) # %const.0 ; CHECK-AIX-64-P8-NEXT: lxsiwzx v2, 0, r3 ; CHECK-AIX-64-P8-NEXT: xxlxor v4, v4, v4 ; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r4 @@ -431,7 +406,7 @@ ; CHECK-AIX-64-P9-LABEL: test_v4i32_none: ; CHECK-AIX-64-P9: # %bb.0: # %entry ; CHECK-AIX-64-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-AIX-64-P9-NEXT: ld r3, L..C4(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C3(r2) # %const.0 ; CHECK-AIX-64-P9-NEXT: xxlxor vs2, vs2, vs2 ; CHECK-AIX-64-P9-NEXT: lxv vs1, 0(r3) ; CHECK-AIX-64-P9-NEXT: xxperm vs0, vs2, vs1 @@ -531,7 +506,7 @@ ; ; CHECK-AIX-64-P8-LABEL: test_none_v2i64: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: ld r5, L..C6(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: ld r5, L..C5(r2) # %const.0 ; CHECK-AIX-64-P8-NEXT: lxsdx v2, 0, r3 ; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r4 ; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r5 @@ -544,7 +519,7 @@ ; CHECK-AIX-64-P9-LABEL: test_none_v2i64: ; CHECK-AIX-64-P9: # %bb.0: # %entry ; CHECK-AIX-64-P9-NEXT: lxsd v2, 0(r3) -; CHECK-AIX-64-P9-NEXT: ld r3, L..C5(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C4(r2) # %const.0 ; CHECK-AIX-64-P9-NEXT: lxv vs0, 0(r4) ; CHECK-AIX-64-P9-NEXT: xxlxor v3, v3, v3 ; CHECK-AIX-64-P9-NEXT: lxv vs1, 0(r3) @@ -635,7 +610,7 @@ ; ; CHECK-AIX-64-P8-LABEL: test_v2i64_none: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: ld r4, L..C7(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: ld r4, L..C6(r2) # %const.0 ; CHECK-AIX-64-P8-NEXT: lxsdx v2, 0, r3 ; CHECK-AIX-64-P8-NEXT: xxlxor v4, v4, v4 ; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r4 @@ -646,7 +621,7 @@ ; CHECK-AIX-64-P9-LABEL: test_v2i64_none: ; CHECK-AIX-64-P9: # %bb.0: # %entry ; CHECK-AIX-64-P9-NEXT: lfd f0, 0(r3) -; CHECK-AIX-64-P9-NEXT: ld r3, L..C6(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C5(r2) # %const.0 ; CHECK-AIX-64-P9-NEXT: xxlxor vs2, vs2, vs2 ; CHECK-AIX-64-P9-NEXT: lxv vs1, 0(r3) ; CHECK-AIX-64-P9-NEXT: xxperm vs0, vs2, vs1 @@ -742,7 +717,7 @@ ; ; CHECK-AIX-64-P8-LABEL: test_v8i16_v8i16: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: ld r5, L..C8(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: ld r5, L..C7(r2) # %const.0 ; CHECK-AIX-64-P8-NEXT: lhz r3, 0(r3) ; CHECK-AIX-64-P8-NEXT: lhz r4, 0(r4) ; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r3 @@ -754,7 +729,7 @@ ; CHECK-AIX-64-P9-LABEL: test_v8i16_v8i16: ; CHECK-AIX-64-P9: # %bb.0: # %entry ; CHECK-AIX-64-P9-NEXT: 
lxsihzx f0, 0, r3 -; CHECK-AIX-64-P9-NEXT: ld r3, L..C7(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C6(r2) # %const.0 ; CHECK-AIX-64-P9-NEXT: lxsihzx v2, 0, r4 ; CHECK-AIX-64-P9-NEXT: lxv vs1, 0(r3) ; CHECK-AIX-64-P9-NEXT: xxperm v2, vs0, vs1 @@ -1035,7 +1010,7 @@ ; ; CHECK-AIX-64-P8-LABEL: test_v4i32_v4i32: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: ld r5, L..C9(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: ld r5, L..C8(r2) # %const.0 ; CHECK-AIX-64-P8-NEXT: lxsiwzx v2, 0, r3 ; CHECK-AIX-64-P8-NEXT: lxsiwzx v3, 0, r4 ; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r5 @@ -1048,7 +1023,7 @@ ; CHECK-AIX-64-P9-LABEL: test_v4i32_v4i32: ; CHECK-AIX-64-P9: # %bb.0: # %entry ; CHECK-AIX-64-P9-NEXT: lxsiwzx v2, 0, r3 -; CHECK-AIX-64-P9-NEXT: ld r3, L..C8(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C7(r2) # %const.0 ; CHECK-AIX-64-P9-NEXT: lfiwzx f0, 0, r4 ; CHECK-AIX-64-P9-NEXT: xxlxor v3, v3, v3 ; CHECK-AIX-64-P9-NEXT: lxv vs1, 0(r3) @@ -1328,7 +1303,7 @@ ; ; CHECK-AIX-64-P8-LABEL: test_v2i64_v2i64: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: ld r5, L..C10(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: ld r5, L..C9(r2) # %const.0 ; CHECK-AIX-64-P8-NEXT: lxsdx v2, 0, r3 ; CHECK-AIX-64-P8-NEXT: lxsdx v3, 0, r4 ; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r5 @@ -1341,7 +1316,7 @@ ; CHECK-AIX-64-P9-LABEL: test_v2i64_v2i64: ; CHECK-AIX-64-P9: # %bb.0: # %entry ; CHECK-AIX-64-P9-NEXT: lxsd v2, 0(r3) -; CHECK-AIX-64-P9-NEXT: ld r3, L..C9(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C8(r2) # %const.0 ; CHECK-AIX-64-P9-NEXT: lfd f0, 0(r4) ; CHECK-AIX-64-P9-NEXT: xxlxor v3, v3, v3 ; CHECK-AIX-64-P9-NEXT: lxv vs1, 0(r3) diff --git a/llvm/test/CodeGen/X86/haddsub-3.ll b/llvm/test/CodeGen/X86/haddsub-3.ll --- a/llvm/test/CodeGen/X86/haddsub-3.ll +++ b/llvm/test/CodeGen/X86/haddsub-3.ll @@ -2,9 +2,9 @@ ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3,SSSE3-SLOW ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3,SSSE3-FAST -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX1,AVX1-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX1,AVX1-FAST -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 define float @pr26491(<4 x float> %a0) { ; SSE2-LABEL: pr26491: @@ -136,37 +136,15 @@ ; SSE2-NEXT: subps %xmm2, %xmm0 ; SSE2-NEXT: retq ; -; SSSE3-SLOW-LABEL: PR48823: -; SSSE3-SLOW: # %bb.0: -; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm2 -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[2,3] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2] -; SSSE3-SLOW-NEXT: subps %xmm2, %xmm0 -; SSSE3-SLOW-NEXT: retq +; SSSE3-LABEL: PR48823: +; SSSE3: # %bb.0: +; SSSE3-NEXT: hsubps %xmm1, %xmm0 +; SSSE3-NEXT: retq ; -; SSSE3-FAST-LABEL: PR48823: -; SSSE3-FAST: # %bb.0: -; SSSE3-FAST-NEXT: hsubps %xmm1, %xmm0 -; SSSE3-FAST-NEXT: retq -; -; AVX1-SLOW-LABEL: PR48823: -; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,1],xmm1[2,3] -; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 
= xmm0[0,1],xmm1[2,2] -; AVX1-SLOW-NEXT: vsubps %xmm2, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: retq -; -; AVX1-FAST-LABEL: PR48823: -; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vhsubps %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: retq -; -; AVX2-LABEL: PR48823: -; AVX2: # %bb.0: -; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,1],xmm1[2,3] -; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2] -; AVX2-NEXT: vsubps %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: retq +; AVX-LABEL: PR48823: +; AVX: # %bb.0: +; AVX-NEXT: vhsubps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %3 = shufflevector <4 x float> %0, <4 x float> poison, <4 x i32> %4 = fsub <4 x float> %0, %3 %5 = shufflevector <4 x float> %1, <4 x float> poison, <4 x i32> diff --git a/llvm/test/CodeGen/X86/haddsub-undef.ll b/llvm/test/CodeGen/X86/haddsub-undef.ll --- a/llvm/test/CodeGen/X86/haddsub-undef.ll +++ b/llvm/test/CodeGen/X86/haddsub-undef.ll @@ -465,16 +465,22 @@ } define <2 x double> @add_pd_010(<2 x double> %x) { -; SSE-LABEL: add_pd_010: -; SSE: # %bb.0: -; SSE-NEXT: haddpd %xmm0, %xmm0 -; SSE-NEXT: retq +; SSE-SLOW-LABEL: add_pd_010: +; SSE-SLOW: # %bb.0: +; SSE-SLOW-NEXT: movapd %xmm0, %xmm1 +; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-SLOW-NEXT: addpd %xmm1, %xmm0 +; SSE-SLOW-NEXT: retq +; +; SSE-FAST-LABEL: add_pd_010: +; SSE-FAST: # %bb.0: +; SSE-FAST-NEXT: haddpd %xmm0, %xmm0 +; SSE-FAST-NEXT: retq ; ; AVX-SLOW-LABEL: add_pd_010: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: retq ; ; AVX-FAST-LABEL: add_pd_010: @@ -608,7 +614,7 @@ ; SSE-SLOW: # %bb.0: ; SSE-SLOW-NEXT: movaps %xmm0, %xmm1 ; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] -; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-SLOW-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE-SLOW-NEXT: addps %xmm1, %xmm0 ; SSE-SLOW-NEXT: retq ; @@ -621,7 +627,7 @@ ; AVX-SLOW-LABEL: add_ps_017: ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,2,2] +; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: retq ; @@ -637,16 +643,22 @@ } define <4 x float> @add_ps_018(<4 x float> %x) { -; SSE-LABEL: add_ps_018: -; SSE: # %bb.0: -; SSE-NEXT: haddps %xmm0, %xmm0 -; SSE-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] -; SSE-NEXT: retq +; SSE-SLOW-LABEL: add_ps_018: +; SSE-SLOW: # %bb.0: +; SSE-SLOW-NEXT: movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2] +; SSE-SLOW-NEXT: addps %xmm1, %xmm0 +; SSE-SLOW-NEXT: retq +; +; SSE-FAST-LABEL: add_ps_018: +; SSE-FAST: # %bb.0: +; SSE-FAST-NEXT: haddps %xmm0, %xmm0 +; SSE-FAST-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] +; SSE-FAST-NEXT: retq ; ; AVX1-SLOW-LABEL: add_ps_018: ; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] +; AVX1-SLOW-NEXT: vmovsldup {{.*#+}} xmm1 = xmm0[0,0,2,2] +; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 ; AVX1-SLOW-NEXT: retq ; ; AVX1-FAST-LABEL: add_ps_018: @@ -655,11 +667,17 @@ ; AVX1-FAST-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] ; AVX1-FAST-NEXT: retq ; -; AVX512-LABEL: add_ps_018: -; AVX512: # %bb.0: -; AVX512-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vbroadcastss %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512-SLOW-LABEL: add_ps_018: +; AVX512-SLOW: # %bb.0: +; 
AVX512-SLOW-NEXT: vbroadcastss %xmm0, %xmm1 +; AVX512-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX512-SLOW-NEXT: retq +; +; AVX512-FAST-LABEL: add_ps_018: +; AVX512-FAST: # %bb.0: +; AVX512-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX512-FAST-NEXT: vbroadcastss %xmm0, %xmm0 +; AVX512-FAST-NEXT: retq %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %add = fadd <4 x float> %l, %r @@ -928,30 +946,15 @@ } define <4 x float> @PR45747_1(<4 x float> %a, <4 x float> %b) nounwind { -; SSE-SLOW-LABEL: PR45747_1: -; SSE-SLOW: # %bb.0: -; SSE-SLOW-NEXT: movaps %xmm0, %xmm1 -; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2] -; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-SLOW-NEXT: addps %xmm1, %xmm0 -; SSE-SLOW-NEXT: retq +; SSE-LABEL: PR45747_1: +; SSE: # %bb.0: +; SSE-NEXT: haddps %xmm0, %xmm0 +; SSE-NEXT: retq ; -; SSE-FAST-LABEL: PR45747_1: -; SSE-FAST: # %bb.0: -; SSE-FAST-NEXT: haddps %xmm0, %xmm0 -; SSE-FAST-NEXT: retq -; -; AVX-SLOW-LABEL: PR45747_1: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,2,2,2] -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: retq -; -; AVX-FAST-LABEL: PR45747_1: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: retq +; AVX-LABEL: PR45747_1: +; AVX: # %bb.0: +; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq %t0 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> %t1 = fadd <4 x float> %t0, %a %shuffle = shufflevector <4 x float> %t1, <4 x float> undef, <4 x i32> @@ -976,7 +979,7 @@ ; AVX-SLOW-LABEL: PR45747_2: ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,1,1] +; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 ; AVX-SLOW-NEXT: retq ; @@ -1012,34 +1015,15 @@ } define <4 x float> @PR34724_add_v4f32_0u23(<4 x float> %0, <4 x float> %1) { -; SSE-SLOW-LABEL: PR34724_add_v4f32_0u23: -; SSE-SLOW: # %bb.0: -; SSE-SLOW-NEXT: movaps %xmm0, %xmm2 -; SSE-SLOW-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1] -; SSE-SLOW-NEXT: addps %xmm2, %xmm0 -; SSE-SLOW-NEXT: movsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] -; SSE-SLOW-NEXT: addps %xmm1, %xmm2 -; SSE-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0] -; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] -; SSE-SLOW-NEXT: retq +; SSE-LABEL: PR34724_add_v4f32_0u23: +; SSE: # %bb.0: +; SSE-NEXT: haddps %xmm1, %xmm0 +; SSE-NEXT: retq ; -; SSE-FAST-LABEL: PR34724_add_v4f32_0u23: -; SSE-FAST: # %bb.0: -; SSE-FAST-NEXT: haddps %xmm1, %xmm0 -; SSE-FAST-NEXT: retq -; -; AVX-SLOW-LABEL: PR34724_add_v4f32_0u23: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,1],xmm1[0,3] -; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,2] -; AVX-SLOW-NEXT: vaddps %xmm2, %xmm0, %xmm0 -; AVX-SLOW-NEXT: retq -; -; AVX-FAST-LABEL: PR34724_add_v4f32_0u23: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 -; AVX-FAST-NEXT: retq +; AVX-LABEL: PR34724_add_v4f32_0u23: +; AVX: # %bb.0: +; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %3 = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> %4 = fadd <4 x float> %3, %0 %5 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll 
b/llvm/test/CodeGen/X86/horizontal-sum.ll --- a/llvm/test/CodeGen/X86/horizontal-sum.ll +++ b/llvm/test/CodeGen/X86/horizontal-sum.ll @@ -37,7 +37,7 @@ ; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm1 ; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,1] -; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,1] +; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] ; AVX1-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1 ; AVX1-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] @@ -54,13 +54,15 @@ ; AVX2-SLOW-LABEL: pair_sum_v4f32_v4f32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX2-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] -; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX2-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; AVX2-SLOW-NEXT: retq %5 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> %6 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> @@ -94,13 +96,9 @@ ; SSSE3-SLOW-LABEL: pair_sum_v4i32_v4i32: ; SSSE3-SLOW: # %bb.0: ; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm0 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,1,3] -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: phaddd %xmm2, %xmm3 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm1 -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] +; SSSE3-SLOW-NEXT: phaddd %xmm3, %xmm0 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: pair_sum_v4i32_v4i32: @@ -113,47 +111,38 @@ ; AVX1-SLOW-LABEL: pair_sum_v4i32_v4i32: ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3] -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm1 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm1 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] +; AVX1-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm2 +; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] +; AVX1-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm2 +; AVX1-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3] +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] +; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; 
AVX1-SLOW-NEXT: retq ; -; AVX1-FAST-LABEL: pair_sum_v4i32_v4i32: -; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm1 -; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: retq +; AVX-FAST-LABEL: pair_sum_v4i32_v4i32: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm1 +; AVX-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq ; ; AVX2-SLOW-LABEL: pair_sum_v4i32_v4i32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] ; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,3] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: pair_sum_v4i32_v4i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vphaddd %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: retq %5 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> %6 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> %7 = add <2 x i32> %5, %6 @@ -186,10 +175,7 @@ ; SSSE3-SLOW-LABEL: pair_sum_v8f32_v4f32: ; SSSE3-SLOW: # %bb.0: ; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm0 -; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm1 -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[1,3] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: haddps %xmm0, %xmm0 ; SSSE3-SLOW-NEXT: haddps %xmm3, %xmm2 ; SSSE3-SLOW-NEXT: movaps %xmm5, %xmm1 ; SSSE3-SLOW-NEXT: haddps %xmm4, %xmm1 @@ -218,9 +204,7 @@ ; AVX1-SLOW-LABEL: pair_sum_v8f32_v4f32: ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,3,1,3] -; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm1 ; AVX1-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm4 ; AVX1-SLOW-NEXT: vhaddps %xmm3, %xmm2, %xmm2 @@ -262,9 +246,7 @@ ; AVX2-SLOW-LABEL: pair_sum_v8f32_v4f32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,3,1,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX2-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm1 ; AVX2-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm4 ; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm2, %xmm2 @@ -354,9 +336,7 @@ ; SSSE3-SLOW-LABEL: pair_sum_v8i32_v4i32: ; SSSE3-SLOW: # %bb.0: ; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm0 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,1,3] -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: phaddd %xmm0, %xmm0 ; SSSE3-SLOW-NEXT: phaddd %xmm3, %xmm2 ; SSSE3-SLOW-NEXT: phaddd 
%xmm4, %xmm5 ; SSSE3-SLOW-NEXT: phaddd %xmm5, %xmm2 @@ -386,9 +366,7 @@ ; AVX1-SLOW-LABEL: pair_sum_v8i32_v4i32: ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3] -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vphaddd %xmm4, %xmm4, %xmm1 ; AVX1-SLOW-NEXT: vphaddd %xmm5, %xmm5, %xmm4 ; AVX1-SLOW-NEXT: vphaddd %xmm3, %xmm2, %xmm2 @@ -434,9 +412,7 @@ ; AVX2-SLOW-LABEL: pair_sum_v8i32_v4i32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vphaddd %xmm4, %xmm4, %xmm1 ; AVX2-SLOW-NEXT: vphaddd %xmm5, %xmm5, %xmm4 ; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm2, %xmm2 @@ -535,27 +511,24 @@ define <4 x float> @sequential_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) { ; SSSE3-SLOW-LABEL: sequential_sum_v4f32_v4f32: ; SSSE3-SLOW: # %bb.0: -; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm5 -; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm5 ; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm4 -; SSSE3-SLOW-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm4 +; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm5 +; SSSE3-SLOW-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] -; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSSE3-SLOW-NEXT: addps %xmm2, %xmm0 -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,1] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] +; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm0 +; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] +; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm0 +; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm6 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm2[1,1] +; SSSE3-SLOW-NEXT: addps %xmm0, %xmm6 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,0] ; SSSE3-SLOW-NEXT: addps %xmm5, %xmm4 -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,3] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[2,0] ; SSSE3-SLOW-NEXT: addps %xmm1, %xmm4 -; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm3[1,1,3,3] -; SSSE3-SLOW-NEXT: addps %xmm3, %xmm0 -; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm1 -; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; SSSE3-SLOW-NEXT: addps %xmm0, %xmm1 -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] -; SSSE3-SLOW-NEXT: addps %xmm1, %xmm3 -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] ; SSSE3-SLOW-NEXT: movaps %xmm4, %xmm0 ; SSSE3-SLOW-NEXT: retq ; @@ -566,21 +539,16 @@ ; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4 ; SSSE3-FAST-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,3] -; SSSE3-FAST-NEXT: haddps %xmm2, %xmm2 -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm2[0,1] -; SSSE3-FAST-NEXT: 
addps %xmm5, %xmm4 -; SSSE3-FAST-NEXT: addps %xmm1, %xmm4 ; SSSE3-FAST-NEXT: movaps %xmm3, %xmm0 -; SSSE3-FAST-NEXT: haddps %xmm3, %xmm0 -; SSSE3-FAST-NEXT: movaps %xmm3, %xmm1 -; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; SSSE3-FAST-NEXT: addps %xmm0, %xmm1 -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] -; SSSE3-FAST-NEXT: addps %xmm1, %xmm3 -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] +; SSSE3-FAST-NEXT: haddps %xmm2, %xmm0 +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[2,0] +; SSSE3-FAST-NEXT: movaps %xmm3, %xmm0 +; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] +; SSSE3-FAST-NEXT: addps %xmm5, %xmm4 +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3] +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[2,0] +; SSSE3-FAST-NEXT: addps %xmm1, %xmm4 ; SSSE3-FAST-NEXT: movaps %xmm4, %xmm0 ; SSSE3-FAST-NEXT: retq ; @@ -592,7 +560,7 @@ ; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] ; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,1] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,3] ; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3,3] ; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] @@ -611,7 +579,7 @@ ; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero ; AVX-FAST-NEXT: vhaddps %xmm2, %xmm2, %xmm1 -; AVX-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,1] +; AVX-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,3] ; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3,3] ; AVX-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] ; AVX-FAST-NEXT: vhaddps %xmm3, %xmm3, %xmm4 @@ -651,114 +619,102 @@ ; SSSE3-SLOW: # %bb.0: ; SSSE3-SLOW-NEXT: movdqa %xmm0, %xmm4 ; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm4 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; SSSE3-SLOW-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm4 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,1,0,1] -; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm5 -; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm5 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] -; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm6 +; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm1 +; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSSE3-SLOW-NEXT: movdqa %xmm0, %xmm5 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,0] +; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm1 +; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm6 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm2[1,1] ; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm6 -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[2,3] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,0] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,0] ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3] ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm3[2,0] ; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm0 +; SSSE3-SLOW-NEXT: paddd %xmm5, %xmm0 ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: 
sequential_sum_v4i32_v4i32: ; SSSE3-FAST: # %bb.0: ; SSSE3-FAST-NEXT: movdqa %xmm0, %xmm4 ; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm4 -; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; SSSE3-FAST-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-FAST-NEXT: paddd %xmm0, %xmm4 -; SSSE3-FAST-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-FAST-NEXT: movdqa %xmm3, %xmm1 ; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm1 -; SSSE3-FAST-NEXT: paddd %xmm2, %xmm1 -; SSSE3-FAST-NEXT: movdqa %xmm3, %xmm5 -; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm5 -; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] -; SSSE3-FAST-NEXT: paddd %xmm5, %xmm6 -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,3] -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,0] +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[2,0] +; SSSE3-FAST-NEXT: movdqa %xmm3, %xmm1 +; SSSE3-FAST-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSSE3-FAST-NEXT: movdqa %xmm0, %xmm5 +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,0] ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3] ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm3[2,0] ; SSSE3-FAST-NEXT: paddd %xmm4, %xmm0 +; SSSE3-FAST-NEXT: paddd %xmm5, %xmm0 ; SSSE3-FAST-NEXT: retq ; ; AVX1-SLOW-LABEL: sequential_sum_v4i32_v4i32: ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm4 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; AVX1-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] ; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] -; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm2[4,5,6,7] -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,0,0,0] +; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,3] +; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3],zero +; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7] +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1] +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,0,0,0] +; AVX1-SLOW-NEXT: vpaddd %xmm5, %xmm4, %xmm4 +; AVX1-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,2,2,2] -; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm4, %xmm2 -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1 -; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] +; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6,7] +; AVX1-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: retq ; ; AVX1-FAST-LABEL: sequential_sum_v4i32_v4i32: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm4 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; AVX1-FAST-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = 
xmm0[3,3,3,3] ; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] ; AVX1-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm1 -; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] -; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,3] +; AVX1-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3],zero +; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7] +; AVX1-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm4 +; AVX1-FAST-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,2,2,2] +; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6,7] +; AVX1-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm2[4,5,6,7] -; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm1 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2] -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1 -; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] +; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-FAST-NEXT: retq ; ; AVX2-SLOW-LABEL: sequential_sum_v4i32_v4i32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] ; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,3] +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3],zero ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1] -; AVX2-SLOW-NEXT: vpbroadcastd %xmm3, %xmm5 -; AVX2-SLOW-NEXT: vpaddd %xmm5, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] +; AVX2-SLOW-NEXT: vpaddd %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,2,2,2] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] ; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: retq @@ -766,22 +722,19 @@ ; AVX2-FAST-LABEL: sequential_sum_v4i32_v4i32: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm4 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm1 -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] -; 
AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,3] +; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3],zero ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] ; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm4 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,2,2,2] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3] -; AVX2-FAST-NEXT: vpbroadcastd %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] +; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,2,2,2] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] ; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: retq @@ -962,25 +915,18 @@ ; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm4 ; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] ; SSSE3-SLOW-NEXT: addps %xmm4, %xmm0 -; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm5 -; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] -; SSSE3-SLOW-NEXT: addps %xmm1, %xmm5 -; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm5[1,1,3,3] -; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm4 +; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSSE3-SLOW-NEXT: addps %xmm1, %xmm4 +; SSSE3-SLOW-NEXT: haddps %xmm4, %xmm0 ; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1 ; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; SSSE3-SLOW-NEXT: addps %xmm2, %xmm1 ; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm2 ; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; SSSE3-SLOW-NEXT: addps %xmm3, %xmm2 -; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm3 -; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] -; SSSE3-SLOW-NEXT: addps %xmm4, %xmm0 +; SSSE3-SLOW-NEXT: haddps %xmm2, %xmm1 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc: @@ -1008,17 +954,13 @@ ; AVX-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0] ; AVX-SLOW-NEXT: vaddps %xmm4, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0] -; AVX-SLOW-NEXT: vaddps %xmm4, %xmm2, %xmm2 -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] -; AVX-SLOW-NEXT: vaddps %xmm4, %xmm3, %xmm3 -; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm3[1,1],xmm2[1,1] -; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm5 = xmm0[1],xmm1[1],zero,zero -; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,0] -; AVX-SLOW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] -; AVX-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0 +; AVX-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 +; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] +; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1 +; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0] +; 
AVX-SLOW-NEXT: vaddps %xmm2, %xmm3, %xmm2 +; AVX-SLOW-NEXT: vhaddps %xmm2, %xmm1, %xmm1 +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX-SLOW-NEXT: retq ; ; AVX-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc: @@ -1051,23 +993,15 @@ ; SSSE3-SLOW: # %bb.0: ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] ; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm0 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] -; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm5 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] -; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm4 +; SSSE3-SLOW-NEXT: phaddd %xmm4, %xmm0 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] ; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm1 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] -; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm6 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,1,1] -; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm0 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] +; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm2 +; SSSE3-SLOW-NEXT: phaddd %xmm2, %xmm1 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: reduction_sum_v4i32_v4i32: @@ -1085,28 +1019,20 @@ ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; SSSE3-FAST-NEXT: retq ; -; AVX1-SLOW-LABEL: reduction_sum_v4i32_v4i32: -; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] -; AVX1-SLOW-NEXT: vpaddd %xmm5, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] -; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] -; AVX1-SLOW-NEXT: vpaddd %xmm5, %xmm2, %xmm2 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1] -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] -; AVX1-SLOW-NEXT: vpaddd %xmm6, %xmm3, %xmm3 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1] -; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-SLOW-NEXT: vpaddd %xmm5, %xmm2, %xmm2 -; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX1-SLOW-NEXT: retq +; AVX-SLOW-LABEL: reduction_sum_v4i32_v4i32: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; AVX-SLOW-NEXT: vpaddd %xmm4, %xmm1, %xmm1 +; AVX-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] +; 
AVX-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX-SLOW-NEXT: vphaddd %xmm2, %xmm1, %xmm1 +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX-SLOW-NEXT: retq ; ; AVX-FAST-LABEL: reduction_sum_v4i32_v4i32: ; AVX-FAST: # %bb.0: @@ -1122,28 +1048,6 @@ ; AVX-FAST-NEXT: vphaddd %xmm2, %xmm1, %xmm1 ; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX-FAST-NEXT: retq -; -; AVX2-SLOW-LABEL: reduction_sum_v4i32_v4i32: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] -; AVX2-SLOW-NEXT: vpaddd %xmm5, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] -; AVX2-SLOW-NEXT: vpaddd %xmm5, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3] -; AVX2-SLOW-NEXT: vpaddd %xmm5, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-SLOW-NEXT: vpbroadcastd %xmm3, %xmm1 -; AVX2-SLOW-NEXT: vpbroadcastd %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: retq %5 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %0) %6 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %1) %7 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %2) diff --git a/llvm/test/CodeGen/X86/phaddsub.ll b/llvm/test/CodeGen/X86/phaddsub.ll --- a/llvm/test/CodeGen/X86/phaddsub.ll +++ b/llvm/test/CodeGen/X86/phaddsub.ll @@ -431,35 +431,35 @@ define <4 x i32> @phaddd_single_source5(<4 x i32> %x) { ; SSSE3-SLOW-LABEL: phaddd_single_source5: ; SSSE3-SLOW: # %bb.0: -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,2,2] -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: phaddd_single_source5: ; SSSE3-FAST: # %bb.0: ; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0 -; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSSE3-FAST-NEXT: retq ; ; AVX-SLOW-LABEL: phaddd_single_source5: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,2,2] -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: retq ; ; AVX-FAST-LABEL: phaddd_single_source5: ; AVX-FAST: # %bb.0: ; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX-FAST-NEXT: retq ; ; AVX2-SHUF-LABEL: phaddd_single_source5: ; AVX2-SHUF: # %bb.0: -; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,2,2] -; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-SHUF-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; 
AVX2-SHUF-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-SHUF-NEXT: retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32>
  %add = add <4 x i32> %l, %x
@@ -468,17 +468,35 @@
}

define <4 x i32> @phaddd_single_source6(<4 x i32> %x) {
-; SSSE3-LABEL: phaddd_single_source6:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: phaddd %xmm0, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; SSSE3-NEXT: retq
+; SSSE3-SLOW-LABEL: phaddd_single_source6:
+; SSSE3-SLOW: # %bb.0:
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: retq
;
-; AVX-LABEL: phaddd_single_source6:
-; AVX: # %bb.0:
-; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX-NEXT: retq
+; SSSE3-FAST-LABEL: phaddd_single_source6:
+; SSSE3-FAST: # %bb.0:
+; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSSE3-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: phaddd_single_source6:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: phaddd_single_source6:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX-FAST-NEXT: retq
+;
+; AVX2-SHUF-LABEL: phaddd_single_source6:
+; AVX2-SHUF: # %bb.0:
+; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; AVX2-SHUF-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX2-SHUF-NEXT: retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32>
  %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32>
  %add = add <4 x i32> %l, %r
@@ -593,17 +611,36 @@
}

define <8 x i16> @phaddw_single_source6(<8 x i16> %x) {
-; SSSE3-LABEL: phaddw_single_source6:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: phaddw %xmm0, %xmm0
-; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: retq
+; SSSE3-SLOW-LABEL: phaddw_single_source6:
+; SSSE3-SLOW: # %bb.0:
+; SSSE3-SLOW-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-SLOW-NEXT: pslld $16, %xmm1
+; SSSE3-SLOW-NEXT: paddw %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: retq
;
-; AVX-LABEL: phaddw_single_source6:
-; AVX: # %bb.0:
-; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX-NEXT: retq
+; SSSE3-FAST-LABEL: phaddw_single_source6:
+; SSSE3-FAST: # %bb.0:
+; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0
+; SSSE3-FAST-NEXT: pslld $16, %xmm0
+; SSSE3-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: phaddw_single_source6:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vpslld $16, %xmm0, %xmm1
+; AVX-SLOW-NEXT: vpaddw %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: phaddw_single_source6:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: vpslld $16, %xmm0, %xmm0
+; AVX-FAST-NEXT: retq
+;
+; AVX2-SHUF-LABEL: phaddw_single_source6:
+; AVX2-SHUF: # %bb.0:
+; AVX2-SHUF-NEXT: vpslld $16, %xmm0, %xmm1
+; AVX2-SHUF-NEXT: vpaddw %xmm0, %xmm1, %xmm0
+; AVX2-SHUF-NEXT: retq
  %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32>
  %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32>
  %add = add <8 x i16> %l, %r
diff --git a/llvm/test/CodeGen/X86/sse3.ll b/llvm/test/CodeGen/X86/sse3.ll
--- a/llvm/test/CodeGen/X86/sse3.ll
+++ b/llvm/test/CodeGen/X86/sse3.ll
@@ -395,14 +395,14 @@
define <4 x i32> @t17() nounwind {
; X86-LABEL: t17:
; X86: # %bb.0: # %entry
-; X86-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1]
-; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
; X86-NEXT: retl
;
; X64-LABEL: t17:
; X64: # %bb.0: # %entry
-; X64-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1]
-; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
; X64-NEXT: retq
entry:
  %tmp1 = load <4 x float>, ptr undef, align 16
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
@@ -222,14 +222,16 @@
; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm10[0]
; SSE-NEXT: pandn %xmm9, %xmm6
; SSE-NEXT: por %xmm5, %xmm6
-; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,65535,65535]
-; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,3,1,3,4,5,6,7]
-; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8
; SSE-NEXT: psrlq $48, %xmm4
-; SSE-NEXT: por %xmm8, %xmm4
-; SSE-NEXT: pand %xmm5, %xmm4
-; SSE-NEXT: pandn %xmm1, %xmm5
-; SSE-NEXT: por %xmm4, %xmm5
+; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,65535,65535,65535,65535,65535]
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,3,1,3,4,5,6,7]
+; SSE-NEXT: pand %xmm5, %xmm8
+; SSE-NEXT: pandn %xmm4, %xmm5
+; SSE-NEXT: por %xmm8, %xmm5
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,65535,65535,65535,65535]
+; SSE-NEXT: pand %xmm4, %xmm5
+; SSE-NEXT: pandn %xmm1, %xmm4
+; SSE-NEXT: por %xmm5, %xmm4
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[2,1]
; SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,65535,65535,65535,0,0,0,65535]
@@ -241,7 +243,7 @@
; SSE-NEXT: andnps %xmm3, %xmm2
; SSE-NEXT: orps %xmm0, %xmm2
; SSE-NEXT: movaps %xmm2, (%rax)
-; SSE-NEXT: movq %xmm5, 48(%rax)
+; SSE-NEXT: movq %xmm4, 48(%rax)
; SSE-NEXT: movdqa %xmm6, 32(%rax)
; SSE-NEXT: movdqa %xmm7, 16(%rax)
; SSE-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll
@@ -31,10 +31,9 @@
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT: movq %xmm2, 32(%r9)
+; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
+; SSE-NEXT: movq %xmm1, 32(%r9)
; SSE-NEXT: movaps %xmm6, (%r9)
; SSE-NEXT: movaps %xmm0, 16(%r9)
; SSE-NEXT: retq
@@ -322,75 +321,72 @@
; SSE-LABEL: store_i32_stride5_vf8:
; SSE: # %bb.0:
; SSE-NEXT: movaps (%rdi), %xmm1
-; SSE-NEXT: movaps 16(%rdi), %xmm5
-; SSE-NEXT: movdqa (%rsi), %xmm6
-; SSE-NEXT: movdqa 16(%rsi), %xmm3
-; SSE-NEXT: movdqa (%rdx), %xmm7
-; SSE-NEXT: movdqa 16(%rdx), %xmm10
+; SSE-NEXT: movaps 16(%rdi), %xmm0
+; SSE-NEXT: movaps (%rsi), %xmm6
+; SSE-NEXT: movaps 16(%rsi), %xmm7
+; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps (%rdx), %xmm3
+; SSE-NEXT: movaps 16(%rdx), %xmm10
; SSE-NEXT: movaps (%rcx), %xmm4
; SSE-NEXT: movaps 16(%rcx), %xmm2
-; SSE-NEXT: movaps (%r8), %xmm0
+; SSE-NEXT: movaps (%r8), %xmm5
; SSE-NEXT: movaps 16(%r8), %xmm11
-; SSE-NEXT: movaps %xmm2, %xmm9
-; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3],xmm11[3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[3,3,3,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1]
-; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm9[0,2]
-; SSE-NEXT: movaps %xmm4, %xmm12
-; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,3],xmm0[3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm6[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[2,3,2,3]
-; SSE-NEXT: movss {{.*#+}} xmm9 = xmm13[0],xmm9[1,2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm12[0,2]
-; SSE-NEXT: movaps %xmm5, %xmm13
-; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,1],xmm11[2,3]
-; SSE-NEXT: movaps %xmm2, %xmm12
-; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm10[1]
-; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm13[2,0]
-; SSE-NEXT: movaps %xmm5, %xmm13
-; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm3[2],xmm13[3],xmm3[3]
-; SSE-NEXT: movaps %xmm10, %xmm14
-; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1]
+; SSE-NEXT: movaps %xmm2, %xmm8
+; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,3],xmm11[3,3]
+; SSE-NEXT: movaps %xmm7, %xmm9
+; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm10[2],xmm9[3],xmm10[3]
+; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm8[0,2]
+; SSE-NEXT: movaps %xmm0, %xmm12
+; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm11[2,3]
+; SSE-NEXT: movaps %xmm2, %xmm8
+; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm10[1]
+; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm12[2,0]
+; SSE-NEXT: movaps %xmm0, %xmm12
+; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm7[2],xmm12[3],xmm7[3]
+; SSE-NEXT: movaps %xmm10, %xmm13
+; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm11[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm13[0,1]
-; SSE-NEXT: movaps %xmm5, %xmm13
-; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1]
-; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm14[0]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm12[0,1]
+; SSE-NEXT: movaps %xmm0, %xmm12
+; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm13[0]
+; SSE-NEXT: movaps %xmm4, %xmm14
+; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,3],xmm5[3,3]
+; SSE-NEXT: movaps %xmm6, %xmm13
+; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,3],xmm3[3,3]
+; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm14[0,2]
; SSE-NEXT: movaps %xmm1, %xmm14
-; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,0],xmm0[2,0]
-; SSE-NEXT: movaps %xmm0, %xmm3
-; SSE-NEXT: movdqa %xmm7, %xmm15
-; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm4[2],xmm15[3],xmm4[3]
+; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,0],xmm5[2,0]
+; SSE-NEXT: movaps %xmm3, %xmm15
+; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm4[2],xmm15[3],xmm4[3]
; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm14[2,0]
; SSE-NEXT: movaps %xmm1, %xmm14
-; SSE-NEXT: movdqa %xmm6, %xmm0
; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm6[2],xmm14[3],xmm6[3]
-; SSE-NEXT: movdqa %xmm7, %xmm6
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1]
+; SSE-NEXT: movaps %xmm6, %xmm7
+; SSE-NEXT: movaps %xmm3, %xmm6
+; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm5[1,1]
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm14[0,1]
; SSE-NEXT: movaps %xmm1, %xmm14
-; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
+; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1]
; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm6[0]
; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
; SSE-NEXT: # xmm10 = xmm10[1,1],mem[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm10[2,0]
-; SSE-NEXT: movss {{.*#+}} xmm5 = xmm11[0],xmm5[1,2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm0[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,0]
-; SSE-NEXT: movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,0]
+; SSE-NEXT: movss {{.*#+}} xmm0 = xmm11[0],xmm0[1,2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm7[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0]
+; SSE-NEXT: movss {{.*#+}} xmm1 = xmm5[0],xmm1[1,2,3]
; SSE-NEXT: movaps %xmm14, (%r9)
; SSE-NEXT: movaps %xmm4, 32(%r9)
; SSE-NEXT: movaps %xmm15, 48(%r9)
-; SSE-NEXT: movaps %xmm13, 80(%r9)
+; SSE-NEXT: movaps %xmm13, 64(%r9)
+; SSE-NEXT: movaps %xmm12, 80(%r9)
; SSE-NEXT: movaps %xmm2, 112(%r9)
-; SSE-NEXT: movaps %xmm12, 128(%r9)
+; SSE-NEXT: movaps %xmm8, 128(%r9)
+; SSE-NEXT: movaps %xmm9, 144(%r9)
; SSE-NEXT: movaps %xmm1, 16(%r9)
-; SSE-NEXT: movaps %xmm9, 64(%r9)
-; SSE-NEXT: movaps %xmm5, 96(%r9)
-; SSE-NEXT: movaps %xmm8, 144(%r9)
+; SSE-NEXT: movaps %xmm0, 96(%r9)
; SSE-NEXT: retq
;
; AVX1-ONLY-LABEL: store_i32_stride5_vf8:
@@ -683,180 +679,170 @@
define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind {
; SSE-LABEL: store_i32_stride5_vf16:
; SSE: # %bb.0:
-; SSE-NEXT: subq $168, %rsp
-; SSE-NEXT: movdqa (%rsi), %xmm10
-; SSE-NEXT: movdqa 16(%rsi), %xmm6
-; SSE-NEXT: movdqa 32(%rsi), %xmm7
-; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa (%rdx), %xmm5
-; SSE-NEXT: movdqa 16(%rdx), %xmm9
-; SSE-NEXT: movdqa 32(%rdx), %xmm4
-; SSE-NEXT: movaps (%rcx), %xmm12
-; SSE-NEXT: movaps 16(%rcx), %xmm13
-; SSE-NEXT: movaps 32(%rcx), %xmm11
-; SSE-NEXT: movaps (%r8), %xmm3
-; SSE-NEXT: movaps 16(%r8), %xmm15
-; SSE-NEXT: movaps 32(%r8), %xmm8
+; SSE-NEXT: subq $152, %rsp
+; SSE-NEXT: movaps (%rdi), %xmm5
+; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 16(%rdi), %xmm3
+; SSE-NEXT: movaps (%rsi), %xmm8
; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm12, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3]
-; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2]
+; SSE-NEXT: movaps 16(%rsi), %xmm6
+; SSE-NEXT: movaps (%rdx), %xmm14
+; SSE-NEXT: movaps 16(%rdx), %xmm7
+; SSE-NEXT: movaps (%rcx), %xmm1
+; SSE-NEXT: movaps 16(%rcx), %xmm0
+; SSE-NEXT: movaps (%r8), %xmm11
+; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 16(%r8), %xmm10
+; SSE-NEXT: movaps %xmm14, %xmm2
+; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE-NEXT: movaps %xmm5, %xmm4
+; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0]
+; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm5, %xmm2
+; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3]
+; SSE-NEXT: movaps %xmm1, %xmm4
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm11[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm2[0,1]
+; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm5, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm11[2,0]
+; SSE-NEXT: movaps %xmm14, %xmm4
+; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0]
+; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm11[3,3]
+; SSE-NEXT: movaps %xmm8, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm14[3,3]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm13, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm15[3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[3,3,3,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2]
-; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3]
-; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; SSE-NEXT: movaps %xmm11, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm8[3,3]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 48(%rsi), %xmm1
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 48(%rdx), %xmm8
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movaps 48(%rcx), %xmm7
-; SSE-NEXT: movaps 48(%r8), %xmm2
-; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill
-; SSE-NEXT: movaps %xmm7, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm5, %xmm0
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1]
-; SSE-NEXT: movdqa (%rdi), %xmm14
-; SSE-NEXT: movdqa %xmm14, %xmm2
-; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm14, %xmm0
-; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3]
-; SSE-NEXT: movdqa %xmm5, %xmm2
-; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm12[2],xmm2[3],xmm12[3]
+; SSE-NEXT: movaps %xmm7, %xmm1
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm3[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm0[0,1]
-; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm14, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm3[2,0]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
+; SSE-NEXT: movaps %xmm3, %xmm2
+; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm9, %xmm1
-; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm9, %xmm0
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1]
-; SSE-NEXT: movdqa 16(%rdi), %xmm10
-; SSE-NEXT: movdqa %xmm10, %xmm12
-; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm12 = xmm12[0],xmm0[0]
-; SSE-NEXT: movdqa %xmm10, %xmm0
-; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3]
-; SSE-NEXT: movaps %xmm13, %xmm9
-; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm15[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm0[0,1]
-; SSE-NEXT: movdqa %xmm10, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm15[2,3]
-; SSE-NEXT: punpckhqdq {{.*#+}} xmm13 = xmm13[1],xmm1[1]
-; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm0[2,0]
-; SSE-NEXT: movdqa %xmm4, %xmm15
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm4, %xmm0
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1]
-; SSE-NEXT: movdqa 32(%rdi), %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm6
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm0[0]
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm11[2],xmm15[3],xmm11[3]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm1[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,1]
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
-; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,0]
-; SSE-NEXT: movdqa %xmm8, %xmm3
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1]
-; SSE-NEXT: movdqa 48(%rdi), %xmm4
-; SSE-NEXT: movdqa %xmm4, %xmm5
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0]
-; SSE-NEXT: movdqa %xmm4, %xmm0
-; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT: movaps %xmm7, %xmm3
-; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,1]
-; SSE-NEXT: movdqa %xmm4, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3]
-; SSE-NEXT: punpckhqdq {{.*#+}} xmm7 = xmm7[1],xmm8[1]
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm0[2,0]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm0[2,0]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movss {{.*#+}} xmm14 = xmm0[0],xmm14[1,2,3]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,0]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movss {{.*#+}} xmm10 = xmm0[0],xmm10[1,2,3]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
-; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
-; SSE-NEXT: # xmm8 = xmm8[1,1],mem[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,0]
-; SSE-NEXT: movss {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3]
-; SSE-NEXT: movaps %xmm7, 288(%r9)
-; SSE-NEXT: movaps %xmm3, 272(%r9)
-; SSE-NEXT: movdqa %xmm5, 240(%r9)
-; SSE-NEXT: movaps %xmm15, 208(%r9)
+; SSE-NEXT: movaps %xmm3, %xmm1
+; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
+; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm0, %xmm2
+; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm10[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,1]
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm3, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm10[2,3]
+; SSE-NEXT: movaps %xmm0, %xmm2
+; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm7[1]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm10[3,3]
+; SSE-NEXT: movaps %xmm6, %xmm1
+; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3]
+; SSE-NEXT: movaps %xmm7, %xmm12
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,2]
+; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
+; SSE-NEXT: movaps 32(%rdx), %xmm7
+; SSE-NEXT: movaps 32(%rcx), %xmm0
+; SSE-NEXT: movaps %xmm7, %xmm1
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT: movaps 32(%rdi), %xmm15
+; SSE-NEXT: movaps 32(%rsi), %xmm13
+; SSE-NEXT: movaps %xmm15, %xmm2
+; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm15, %xmm1
+; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3]
+; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 32(%r8), %xmm2
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm0, %xmm11
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm2[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm1[0,1]
+; SSE-NEXT: movaps %xmm15, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[2,0]
+; SSE-NEXT: movaps %xmm7, %xmm10
+; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3]
+; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm1[2,0]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3]
+; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,3],xmm7[3,3]
+; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm0[0,2]
+; SSE-NEXT: movaps 48(%rdx), %xmm2
+; SSE-NEXT: movaps 48(%rcx), %xmm3
+; SSE-NEXT: movaps %xmm2, %xmm1
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE-NEXT: movaps 48(%rdi), %xmm5
+; SSE-NEXT: movaps 48(%rsi), %xmm9
+; SSE-NEXT: movaps %xmm5, %xmm6
+; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0]
+; SSE-NEXT: movaps %xmm5, %xmm1
+; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3]
+; SSE-NEXT: movaps 48(%r8), %xmm8
+; SSE-NEXT: movaps %xmm3, %xmm4
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm8[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[0,1]
+; SSE-NEXT: movaps %xmm5, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm8[2,3]
+; SSE-NEXT: movaps %xmm3, %xmm1
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0]
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm8[3,3]
+; SSE-NEXT: movaps %xmm9, %xmm0
+; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm3[0,2]
+; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
+; SSE-NEXT: # xmm14 = xmm14[1,1],mem[1,1]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm14[2,0]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE-NEXT: movss {{.*#+}} xmm3 = xmm14[0],xmm3[1,2,3]
+; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
+; SSE-NEXT: # xmm12 = xmm12[1,1],mem[1,1]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm12[2,0]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; SSE-NEXT: movss {{.*#+}} xmm14 = xmm12[0],xmm14[1,2,3]
+; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
+; SSE-NEXT: # xmm7 = xmm7[1,1],mem[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm7[2,0]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; SSE-NEXT: movss {{.*#+}} xmm15 = xmm7[0],xmm15[1,2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm9[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,0]
+; SSE-NEXT: movss {{.*#+}} xmm5 = xmm8[0],xmm5[1,2,3]
+; SSE-NEXT: movaps %xmm0, 304(%r9)
+; SSE-NEXT: movaps %xmm1, 288(%r9)
+; SSE-NEXT: movaps %xmm4, 272(%r9)
+; SSE-NEXT: movaps %xmm6, 240(%r9)
+; SSE-NEXT: movaps %xmm13, 224(%r9)
+; SSE-NEXT: movaps %xmm10, 208(%r9)
; SSE-NEXT: movaps %xmm11, 192(%r9)
-; SSE-NEXT: movdqa %xmm6, 160(%r9)
-; SSE-NEXT: movaps %xmm13, 128(%r9)
-; SSE-NEXT: movaps %xmm9, 112(%r9)
-; SSE-NEXT: movdqa %xmm12, 80(%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 160(%r9)
+; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 144(%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 128(%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 112(%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 80(%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 64(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 48(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 32(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, (%r9)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 304(%r9)
-; SSE-NEXT: movaps %xmm4, 256(%r9)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 224(%r9)
-; SSE-NEXT: movaps %xmm2, 176(%r9)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 144(%r9)
-; SSE-NEXT: movaps %xmm10, 96(%r9)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 64(%r9)
-; SSE-NEXT: movaps %xmm14, 16(%r9)
-; SSE-NEXT: addq $168, %rsp
+; SSE-NEXT: movaps %xmm5, 256(%r9)
+; SSE-NEXT: movaps %xmm15, 176(%r9)
+; SSE-NEXT: movaps %xmm14, 96(%r9)
+; SSE-NEXT: movaps %xmm3, 16(%r9)
+; SSE-NEXT: addq $152, %rsp
; SSE-NEXT: retq
;
; AVX1-ONLY-LABEL: store_i32_stride5_vf16:
@@ -1514,347 +1500,323 @@
define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind {
; SSE-LABEL: store_i32_stride5_vf32:
; SSE: # %bb.0:
-; SSE-NEXT: subq $728, %rsp # imm = 0x2D8
-; SSE-NEXT: movdqa (%rsi), %xmm9
-; SSE-NEXT: movdqa 16(%rsi), %xmm7
-; SSE-NEXT: movdqa 32(%rsi), %xmm8
-; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa (%rdx), %xmm11
-; SSE-NEXT: movdqa 16(%rdx), %xmm10
-; SSE-NEXT: movdqa 32(%rdx), %xmm12
-; SSE-NEXT: movdqa %xmm12, (%rsp) # 16-byte Spill
-; SSE-NEXT: movaps (%rcx), %xmm3
-; SSE-NEXT: movaps 16(%rcx), %xmm5
-; SSE-NEXT: movaps 32(%rcx), %xmm6
-; SSE-NEXT: movaps (%r8), %xmm4
-; SSE-NEXT: movaps 16(%r8), %xmm15
-; SSE-NEXT: movaps 32(%r8), %xmm13
-; SSE-NEXT: movaps %xmm3, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm4[3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[2,3,2,3]
-; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2]
-; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm5, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm15[3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[3,3,3,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2]
-; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3]
-; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; SSE-NEXT: movaps %xmm6, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm13[3,3]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 48(%rsi), %xmm12
-; SSE-NEXT: movdqa 48(%rdx), %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,3,3,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movaps 48(%rcx), %xmm8
-; SSE-NEXT: movaps 48(%r8), %xmm2
-; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm8, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 64(%rsi), %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 64(%rdx), %xmm1
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; SSE-NEXT: movaps 64(%rcx), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 64(%r8), %xmm2
-; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 80(%rsi), %xmm2
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 80(%rdx), %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movaps 80(%rcx), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 80(%r8), %xmm2
-; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 96(%rsi), %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 96(%rdx), %xmm1
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; SSE-NEXT: movaps 96(%rcx), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 96(%r8), %xmm2
-; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 112(%rsi), %xmm2
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 112(%rdx), %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movaps 112(%rcx), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 112(%r8), %xmm14
-; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm14[3,3]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm11, %xmm1
-; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm11, %xmm0
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE-NEXT: movdqa (%rdi), %xmm2
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm2, %xmm11
-; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm11 = xmm11[0],xmm0[0]
-; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3]
-; SSE-NEXT: movdqa %xmm1, %xmm9
-; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm3[2],xmm9[3],xmm3[3]
-; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm4[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,1]
-; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm4[2,0]
-; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[2,0]
-; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm10, %xmm0
-; SSE-NEXT: movaps %xmm5, %xmm4
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
-; SSE-NEXT: movdqa 16(%rdi), %xmm2
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm2, %xmm5
-; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0]
-; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3]
-; SSE-NEXT: movaps %xmm4, %xmm5
-; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm15[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,1]
+; SSE-NEXT: subq $664, %rsp # imm = 0x298
+; SSE-NEXT: movaps (%rdi), %xmm5
; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm15[2,3]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm10[1]
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,0]
+; SSE-NEXT: movaps 16(%rdi), %xmm3
+; SSE-NEXT: movaps (%rsi), %xmm7
+; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 16(%rsi), %xmm6
+; SSE-NEXT: movaps (%rdx), %xmm9
+; SSE-NEXT: movaps 16(%rdx), %xmm8
+; SSE-NEXT: movaps (%rcx), %xmm1
+; SSE-NEXT: movaps 16(%rcx), %xmm0
+; SSE-NEXT: movaps (%r8), %xmm11
+; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 16(%r8), %xmm10
+; SSE-NEXT: movaps %xmm9, %xmm2
+; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE-NEXT: movaps %xmm5, %xmm4
+; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
-; SSE-NEXT: movaps %xmm1, %xmm0
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
+; SSE-NEXT: movaps %xmm5, %xmm2
+; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3]
+; SSE-NEXT: movaps %xmm1, %xmm4
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm11[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm2[0,1]
+; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm5, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm11[2,0]
+; SSE-NEXT: movaps %xmm9, %xmm4
+; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0]
+; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm11[3,3]
+; SSE-NEXT: movaps %xmm7, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm9[3,3]
+; SSE-NEXT: movaps %xmm9, %xmm15
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm8, %xmm1
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm3, %xmm2
+; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm3, %xmm1
+; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
+; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm0, %xmm2
+; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm10[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,1]
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm3, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm10[2,3]
+; SSE-NEXT: movaps %xmm0, %xmm2
+; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm8[1]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm10[3,3]
+; SSE-NEXT: movaps %xmm6, %xmm1
+; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm8[2],xmm1[3],xmm8[3]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,2]
+; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 32(%rdx), %xmm4
+; SSE-NEXT: movaps 32(%rcx), %xmm0
+; SSE-NEXT: movaps %xmm4, %xmm1
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: movaps 32(%rdi), %xmm2
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 32(%rsi), %xmm5
; SSE-NEXT: movaps %xmm2, %xmm3
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
-; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm2, %xmm0
-; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3]
-; SSE-NEXT: movaps %xmm1, %xmm3
-; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3]
-; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm13[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[0,1]
-; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm2, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm13[2,0]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0]
-; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: movaps %xmm1, %xmm0
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
-; SSE-NEXT: movdqa 48(%rdi), %xmm10
-; SSE-NEXT: movdqa %xmm10, %xmm3
-; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
-; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm10, %xmm0
-; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3]
-; SSE-NEXT: movaps %xmm8, %xmm3
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm4[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,1]
-; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm10, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm4[2,3]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1]
-; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm0[2,0]
-; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: movaps %xmm1, %xmm0
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSE-NEXT: movaps 64(%rdi), %xmm11
-; SSE-NEXT: movaps %xmm11, %xmm3
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
-; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm11, %xmm0
-; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE-NEXT: movaps %xmm1, %xmm2
-; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1]
-; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm11, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
-; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: movaps %xmm3, %xmm0
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movaps 80(%rdi), %xmm6
-; SSE-NEXT: movaps %xmm6, %xmm13
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1]
-; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0]
-; SSE-NEXT: movaps %xmm6, %xmm0
-; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE-NEXT: movaps %xmm1, %xmm9
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm2[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm0[0,1]
-; SSE-NEXT: movaps %xmm6, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm2[2,3]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0]
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; SSE-NEXT: movaps %xmm14, %xmm0
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movaps 96(%rdi), %xmm5
-; SSE-NEXT: movaps %xmm5, %xmm8
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1]
-; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0]
-; SSE-NEXT: movaps %xmm5, %xmm0
-; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE-NEXT: movaps %xmm14, %xmm7
-; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,1]
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm5, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[2,0]
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[2,0]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: movaps %xmm2, %xmm12
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1]
-; SSE-NEXT: movaps 112(%rdi), %xmm3
-; SSE-NEXT: movaps %xmm3, %xmm4
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1]
-; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm12[0]
-; SSE-NEXT: movaps %xmm3, %xmm0
-; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3]
-; SSE-NEXT: movaps %xmm1, %xmm15
-; SSE-NEXT: movaps %xmm1, %xmm12
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm1[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm0[0,1]
-; SSE-NEXT: movaps %xmm3, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm2[1]
; SSE-NEXT: movaps %xmm2, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm0[2,0]
-; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; SSE-NEXT: # xmm2 = xmm2[1,1],mem[1,1]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1]
+; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; SSE-NEXT: movaps %xmm5, %xmm6
+; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 32(%r8), %xmm5
+; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm0, %xmm3
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm5[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[0,1]
+; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm2, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm5[2,0]
+; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm4, %xmm2
+; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm5[3,3]
+; SSE-NEXT: movaps %xmm6, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2]
+; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 48(%rdx), %xmm4
+; SSE-NEXT: movaps 48(%rcx), %xmm0
+; SSE-NEXT: movaps %xmm4, %xmm1
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT: movaps 48(%rdi), %xmm2
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 48(%rsi), %xmm5
+; SSE-NEXT: movaps %xmm2, %xmm3
+; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0]
+; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm2, %xmm1
+; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; SSE-NEXT: movaps %xmm5, %xmm6
+; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 48(%r8), %xmm5
+; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm0, %xmm3
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm5[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[0,1]
+; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm2, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm5[2,3]
+; SSE-NEXT: movaps %xmm0, %xmm2
+; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm5[3,3]
+; SSE-NEXT: movaps %xmm6, %xmm1
+; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,2]
+; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 64(%rdx), %xmm4
+; SSE-NEXT: movaps 64(%rcx), %xmm0
+; SSE-NEXT: movaps %xmm4, %xmm1
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT: movaps 64(%rdi), %xmm2
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 64(%rsi), %xmm5
+; SSE-NEXT: movaps %xmm2, %xmm3
+; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0]
+; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm2, %xmm1
+; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; SSE-NEXT: movaps %xmm5, %xmm6
+; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 64(%r8), %xmm5
+; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm0, %xmm3
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm5[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[0,1]
+; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm2, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm5[2,0]
+; SSE-NEXT: movaps %xmm4, %xmm2
+; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm5[3,3]
+; SSE-NEXT: movaps %xmm6, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3]
+; SSE-NEXT: movaps %xmm4, %xmm14
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2]
+; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 80(%rdx), %xmm12
+; SSE-NEXT: movaps 80(%rcx), %xmm0
+; SSE-NEXT: movaps %xmm12, %xmm1
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT: movaps 80(%rdi), %xmm3
+; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill
+; SSE-NEXT: movaps 80(%rsi), %xmm4
+; SSE-NEXT: movaps %xmm3, %xmm2
+; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm3, %xmm1
+; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE-NEXT: movaps %xmm4, %xmm5
+; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 80(%r8), %xmm4
+; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm0, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm4[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,1]
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm3, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm4[2,3]
+; SSE-NEXT: movaps %xmm0, %xmm2
+; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm12[1]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm4[3,3]
+; SSE-NEXT: movaps %xmm5, %xmm1
+; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm12[2],xmm1[3],xmm12[3]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,2]
+; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 96(%rdx), %xmm9
+; SSE-NEXT: movaps 96(%rcx), %xmm0
+; SSE-NEXT: movaps %xmm9, %xmm1
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT: movaps 96(%rdi), %xmm13
+; SSE-NEXT: movaps 96(%rsi), %xmm10
+; SSE-NEXT: movaps %xmm13, %xmm2
+; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm13, %xmm1
+; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm10[2],xmm1[3],xmm10[3]
+; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 96(%r8), %xmm3
+; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm0, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm3[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,1]
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm13, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm3[2,0]
+; SSE-NEXT: movaps %xmm9, %xmm11
+; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3]
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[2,0]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3]
+; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,3],xmm9[3,3]
+; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,2]
+; SSE-NEXT: movaps 112(%rdx), %xmm0
+; SSE-NEXT: movaps 112(%rcx), %xmm3
+; SSE-NEXT: movaps %xmm0, %xmm1
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE-NEXT: movaps 112(%rdi), %xmm2
+; SSE-NEXT: movaps 112(%rsi), %xmm8
+; SSE-NEXT: movaps %xmm2, %xmm5
+; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0]
+; SSE-NEXT: movaps %xmm2, %xmm1
+; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm8[2],xmm1[3],xmm8[3]
+; SSE-NEXT: movaps 112(%r8), %xmm6
+; SSE-NEXT: movaps %xmm3, %xmm4
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm6[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[0,1]
+; SSE-NEXT: movaps %xmm2, %xmm7
+; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm6[2,3]
+; SSE-NEXT: movaps %xmm3, %xmm1
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm7[2,0]
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm6[3,3]
+; SSE-NEXT: movaps %xmm8, %xmm7
+; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3]
+; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm3[0,2]
+; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
+; SSE-NEXT: # xmm15 = xmm15[1,1],mem[1,1]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm15[2,0]
 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,0]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movss {{.*#+}} xmm15 = xmm0[0],xmm15[1,2,3]
-; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
-; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,0]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movss {{.*#+}} xmm10 = xmm0[0],xmm10[1,2,3]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[2,0]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movss {{.*#+}} xmm11 = xmm0[0],xmm11[1,2,3]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,0]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movss {{.*#+}} xmm6 = xmm0[0],xmm6[1,2,3]
+; SSE-NEXT: movss {{.*#+}} xmm3 = xmm15[0],xmm3[1,2,3]
+; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
+; SSE-NEXT: # xmm15 = xmm15[1,1],mem[1,1]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm15[2,0]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; SSE-NEXT: movss {{.*#+}} xmm3 = xmm15[0],xmm3[1,2,3]
+; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
+; SSE-NEXT: # xmm15 = xmm15[1,1],mem[1,1]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm15[2,0]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; SSE-NEXT: movss {{.*#+}} xmm3 = xmm15[0],xmm3[1,2,3]
+; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
+; SSE-NEXT: # xmm15 = xmm15[1,1],mem[1,1]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm15[2,0]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; SSE-NEXT: movss {{.*#+}} xmm3 = xmm15[0],xmm3[1,2,3]
 ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
 ; SSE-NEXT: # xmm14 = xmm14[1,1],mem[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm14[2,0]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movss {{.*#+}} xmm5 = xmm0[0],xmm5[1,2,3]
-; SSE-NEXT: movaps %xmm1, %xmm0
-; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 608(%r9)
-; SSE-NEXT: movaps %xmm12, 592(%r9)
-; SSE-NEXT: movaps %xmm4, 560(%r9)
-; SSE-NEXT: movaps %xmm7, 528(%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm14[2,0]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE-NEXT: movss {{.*#+}} xmm15 = xmm14[0],xmm15[1,2,3]
+; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
+; SSE-NEXT: # xmm12 = xmm12[1,1],mem[1,1]
+; SSE-NEXT: movaps (%rsp), %xmm14 # 16-byte Reload
+; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm12[2,0]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; SSE-NEXT: movss {{.*#+}} xmm14 = xmm12[0],xmm14[1,2,3]
+; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
+; SSE-NEXT: # xmm9 = xmm9[1,1],mem[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm9[2,0]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; SSE-NEXT: movss {{.*#+}} xmm13 = xmm9[0],xmm13[1,2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm8[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
+; SSE-NEXT: movss {{.*#+}} xmm2 = xmm6[0],xmm2[1,2,3]
+; SSE-NEXT: movaps %xmm7, 624(%r9)
+; SSE-NEXT: movaps %xmm1, 608(%r9)
+; SSE-NEXT: movaps %xmm4, 592(%r9)
+; SSE-NEXT: movaps %xmm5, 560(%r9)
+; SSE-NEXT: movaps %xmm10, 544(%r9)
+; SSE-NEXT: movaps %xmm11, 528(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 512(%r9)
-; SSE-NEXT: movaps %xmm8, 480(%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 480(%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 464(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 448(%r9)
-; SSE-NEXT: movaps %xmm9, 432(%r9)
-; SSE-NEXT: movaps %xmm13, 400(%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 432(%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 400(%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 384(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 368(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
@@ -1862,55 +1824,49 @@
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 320(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 304(%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 288(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 272(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 240(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 224(%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 208(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 192(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 160(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 144(%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 128(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 112(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 80(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 64(%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 48(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 32(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, (%r9)
+; SSE-NEXT: movaps %xmm2, 576(%r9)
+; SSE-NEXT: movaps %xmm13, 496(%r9)
+; SSE-NEXT: movaps %xmm14, 416(%r9)
+; SSE-NEXT: movaps %xmm15, 336(%r9)
+; SSE-NEXT: movaps %xmm3, 256(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 624(%r9)
-; SSE-NEXT: movaps %xmm3, 576(%r9)
+; SSE-NEXT: movaps %xmm0, 176(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 544(%r9)
-; SSE-NEXT: movaps %xmm5, 496(%r9)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 464(%r9)
-; SSE-NEXT: movaps %xmm6, 416(%r9)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 384(%r9)
-; SSE-NEXT: movaps %xmm11, 336(%r9)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 304(%r9)
-; SSE-NEXT: movaps %xmm10, 256(%r9)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 224(%r9)
-; SSE-NEXT: movaps %xmm2, 176(%r9)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 144(%r9)
-; SSE-NEXT: movaps %xmm15, 96(%r9)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 64(%r9)
+; SSE-NEXT: movaps %xmm0, 96(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 16(%r9)
-; SSE-NEXT: addq $728, %rsp # imm = 0x2D8
+; SSE-NEXT: addq $664, %rsp # imm = 0x298
; SSE-NEXT: retq
;
; AVX1-ONLY-LABEL: store_i32_stride5_vf32:
@@ -3276,835 +3232,781 @@
define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind {
; SSE-LABEL: store_i32_stride5_vf64:
; SSE: # %bb.0:
-; SSE-NEXT: subq $1736, %rsp # imm = 0x6C8
-; SSE-NEXT: movdqa (%rsi), %xmm12
-; SSE-NEXT: movdqa 16(%rsi), %xmm6
-; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 32(%rsi), %xmm3
-; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa (%rdx), %xmm8
-; SSE-NEXT: movdqa 16(%rdx), %xmm11
-; SSE-NEXT: movdqa 32(%rdx), %xmm10
-; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps (%rcx), %xmm4
-; SSE-NEXT: movaps 16(%rcx), %xmm9
-; SSE-NEXT: movaps 32(%rcx), %xmm13
-; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps (%r8), %xmm5
-; SSE-NEXT: movaps 16(%r8), %xmm7
-; SSE-NEXT: movaps 32(%r8), %xmm14
-; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm4, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm5[3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,3,2,3]
-; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2]
-; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm9, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm7[3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[3,3,3,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2]
-; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,3,2,3]
-; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; SSE-NEXT: movaps %xmm13, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm14[3,3]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 48(%rsi), %xmm10
-; SSE-NEXT: movdqa 48(%rdx), %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movaps 48(%rcx), %xmm6
-; SSE-NEXT: movaps 48(%r8), %xmm13
-; SSE-NEXT: movaps %xmm6, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm13[3,3]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 64(%rsi), %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 64(%rdx), %xmm15
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,3,2,3]
-; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; SSE-NEXT: movaps 64(%rcx), %xmm14
-; SSE-NEXT: movaps 64(%r8), %xmm2
-; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm14, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 80(%rsi), %xmm2
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 80(%rdx), %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movaps 80(%rcx), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 80(%r8), %xmm2
-; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 96(%rsi), %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 96(%rdx), %xmm1
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; SSE-NEXT: movaps 96(%rcx), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 96(%r8), %xmm2
-; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 112(%rsi), %xmm2
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 112(%rdx), %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movaps 112(%rcx), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 112(%r8), %xmm2
-; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 128(%rsi), %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 128(%rdx), %xmm1
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; SSE-NEXT: movaps 128(%rcx), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 128(%r8), %xmm2
-; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 144(%rsi), %xmm2
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 144(%rdx), %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movaps 144(%rcx), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps
144(%r8), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdx), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 160(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%r8), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rsi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 176(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 176(%r8), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdx), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 192(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps 192(%r8), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rsi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 208(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 208(%r8), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 224(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 224(%rdx), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 224(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%r8), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 240(%rsi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 240(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 240(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 240(%r8), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm5[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm5[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movdqa 16(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movaps %xmm9, %xmm8 +; SSE-NEXT: subq $1688, %rsp # imm = 0x698 +; SSE-NEXT: movaps (%rdi), %xmm5 +; SSE-NEXT: movaps 16(%rdi), %xmm3 +; SSE-NEXT: movaps (%rsi), %xmm7 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm7[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm7[2,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm11[1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[2,0] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movaps 32(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: movdqa 48(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm13[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm13[2,3] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: punpckhdq 
{{.*#+}} xmm15 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movaps 80(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movaps 16(%rsi), %xmm6 +; SSE-NEXT: movaps (%rdx), %xmm9 +; SSE-NEXT: movaps 16(%rdx), %xmm8 +; SSE-NEXT: movaps (%rcx), %xmm1 +; SSE-NEXT: movaps 16(%rcx), %xmm0 +; SSE-NEXT: movaps (%r8), %xmm11 +; SSE-NEXT: movaps 16(%r8), %xmm10 +; SSE-NEXT: movaps %xmm9, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movaps 96(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] -; SSE-NEXT: shufps 
{{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movaps 112(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movaps 128(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 144(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; 
SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm2[2,3] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 160(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 176(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] ; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm11[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm2[0,1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm2[2,3] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 192(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm11, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = 
xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm11[2,0] +; SSE-NEXT: movaps %xmm9, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm11[3,3] +; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm9[3,3] +; SSE-NEXT: movaps %xmm9, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 208(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm12 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movaps %xmm1, %xmm10 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm2[2,3] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm10[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm10[2,3] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm8[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0] +; SSE-NEXT: 
movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm10[3,3] +; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 224(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm9 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps 32(%rdx), %xmm4 +; SSE-NEXT: movaps 32(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps 32(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps 32(%rsi), %xmm5 +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: movaps 240(%rdi), %xmm5 +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; SSE-NEXT: movaps %xmm5, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%r8), %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[0,1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm5[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm2 +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm5[3,3] +; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 48(%rdx), %xmm4 +; SSE-NEXT: movaps 48(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} 
xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps 48(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 48(%rsi), %xmm5 +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 48(%r8), %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[0,1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm5[2,3] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm5[3,3] +; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rdx), %xmm4 +; SSE-NEXT: movaps 64(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps 64(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rsi), %xmm5 +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%r8), %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[0,1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm5[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm2 +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm5[3,3] +; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 80(%rdx), %xmm4 +; SSE-NEXT: movaps 80(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps 80(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill +; SSE-NEXT: movaps 80(%rsi), %xmm5 +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 80(%r8), %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[0,1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm5[2,3] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm5[3,3] +; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rdx), %xmm4 +; SSE-NEXT: movaps 96(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps 96(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rsi), %xmm5 +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%r8), %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[0,1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm5[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm2 +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm5[3,3] +; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%rdx), %xmm4 +; SSE-NEXT: movaps 112(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps 112(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%rsi), %xmm5 +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = 
xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%r8), %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[0,1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm5[2,3] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm5[3,3] +; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 128(%rdx), %xmm2 +; SSE-NEXT: movaps 128(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps 128(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 128(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 128(%r8), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm4[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm4[3,3] +; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm5[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 144(%rdx), %xmm4 +; SSE-NEXT: movaps 144(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps 144(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 144(%rsi), %xmm5 +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; 
SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 144(%r8), %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm6[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[0,1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm6[2,3] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm6[3,3] +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 160(%rdx), %xmm4 +; SSE-NEXT: movaps 160(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps 160(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 160(%rsi), %xmm5 +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 160(%r8), %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[0,1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm5[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm2 +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm5[3,3] +; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 176(%rdx), %xmm4 +; SSE-NEXT: movaps 176(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps 176(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 176(%rsi), %xmm5 +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = 
xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 176(%r8), %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[0,1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm5[2,3] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm5[3,3] +; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 192(%rdx), %xmm7 +; SSE-NEXT: movaps 192(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm7, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps 192(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 192(%rsi), %xmm5 +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 192(%r8), %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[0,1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm5[2,0] +; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm5[3,3] +; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm7[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 208(%rdx), %xmm8 +; SSE-NEXT: movaps 208(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps 208(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 208(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 208(%r8), %xmm4 +; SSE-NEXT: movaps 
%xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[0,1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm4[2,3] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm8[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm4[3,3] +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 224(%rdx), %xmm5 +; SSE-NEXT: movaps 224(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps 224(%rdi), %xmm9 +; SSE-NEXT: movaps 224(%rsi), %xmm11 +; SSE-NEXT: movaps %xmm9, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm11[2],xmm1[3],xmm11[3] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 224(%r8), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm3[2,0] +; SSE-NEXT: movaps %xmm5, %xmm12 +; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm0[2],xmm12[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,3],xmm5[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2] +; SSE-NEXT: movaps 240(%rdx), %xmm0 +; SSE-NEXT: movaps 240(%rcx), %xmm2 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movaps 240(%rdi), %xmm3 +; SSE-NEXT: movaps 240(%rsi), %xmm14 +; SSE-NEXT: movaps %xmm3, %xmm6 ; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1] ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE-NEXT: movaps %xmm3, %xmm14 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3] -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm2[1] +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] +; SSE-NEXT: movaps 240(%r8), %xmm13 +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm13[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[0,1] +; SSE-NEXT: movaps %xmm3, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm13[2,3] ; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: shufps {{.*#+}} 
xmm14 = xmm14[2,0],xmm0[2,0] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm10[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm13[3,3] +; SSE-NEXT: movaps %xmm14, %xmm10 +; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm2[0,2] +; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[1,1],mem[1,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm15[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm15[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[1,1],mem[1,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm15[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm15[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[1,1],mem[1,1] +; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm15[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm15[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[1,1],mem[1,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm15[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm15[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[1,1],mem[1,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm15[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm15[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[1,1],mem[1,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm15[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm15[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte 
+; SSE-NEXT: # xmm15 = xmm15[1,1],mem[1,1]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm15[2,0]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; SSE-NEXT: movss {{.*#+}} xmm2 = xmm15[0],xmm2[1,2,3]
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
+; SSE-NEXT: # xmm15 = xmm15[1,1],mem[1,1]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm15[2,0]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; SSE-NEXT: movss {{.*#+}} xmm2 = xmm15[0],xmm2[1,2,3]
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; SSE-NEXT: # xmm2 = xmm2[1,1],mem[1,1]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm2[2,0]
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1]
+; SSE-NEXT: movss {{.*#+}} xmm15 = xmm2[0],xmm15[1,2,3]
+; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
+; SSE-NEXT: # xmm15 = xmm15[1,1],mem[1,1]
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm15[2,0]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; SSE-NEXT: movss {{.*#+}} xmm2 = xmm15[0],xmm2[1,2,3]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
+; SSE-NEXT: # xmm15 = xmm15[1,1],mem[1,1]
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm15[2,0]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; SSE-NEXT: movss {{.*#+}} xmm2 = xmm15[0],xmm2[1,2,3]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
+; SSE-NEXT: # xmm15 = xmm15[1,1],mem[1,1]
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm15[2,0]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; SSE-NEXT: movss {{.*#+}} xmm2 = xmm15[0],xmm2[1,2,3]
+; SSE-NEXT: movaps %xmm2, %xmm15
+; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
+; SSE-NEXT: # xmm7 = xmm7[1,1],mem[1,1]
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
-; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
-; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
-; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm0[2,0]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movss {{.*#+}} xmm14 = xmm0[0],xmm14[1,2,3]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,0]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movss {{.*#+}} xmm15 = xmm0[0],xmm15[1,2,3]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[2,0]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movss {{.*#+}} xmm13 = xmm0[0],xmm13[1,2,3]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[2,0]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movss {{.*#+}} xmm11 = xmm0[0],xmm11[1,2,3]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[2,0]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movss {{.*#+}} xmm7 = xmm0[0],xmm7[1,2,3]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movss {{.*#+}} xmm4 = xmm0[0],xmm4[1,2,3]
-; SSE-NEXT: movaps %xmm1, %xmm0
-; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movss {{.*#+}} xmm5 = xmm0[0],xmm5[1,2,3]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 1248(%r9)
-; SSE-NEXT: movaps %xmm3, 1232(%r9)
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,0]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; SSE-NEXT: movss {{.*#+}} xmm2 = xmm7[0],xmm2[1,2,3]
+; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
+; SSE-NEXT: # xmm8 = xmm8[1,1],mem[1,1]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,0]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; SSE-NEXT: movss {{.*#+}} xmm7 = xmm8[0],xmm7[1,2,3]
+; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
+; SSE-NEXT: # xmm5 = xmm5[1,1],mem[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; SSE-NEXT: movss {{.*#+}} xmm9 = xmm5[0],xmm9[1,2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm14[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0]
+; SSE-NEXT: movss {{.*#+}} xmm3 = xmm13[0],xmm3[1,2,3]
+; SSE-NEXT: movaps %xmm10, 1264(%r9)
+; SSE-NEXT: movaps %xmm1, 1248(%r9)
+; SSE-NEXT: movaps %xmm4, 1232(%r9)
; SSE-NEXT: movaps %xmm6, 1200(%r9)
-; SSE-NEXT: movaps %xmm8, 1168(%r9)
+; SSE-NEXT: movaps %xmm11, 1184(%r9)
+; SSE-NEXT: movaps %xmm12, 1168(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1152(%r9)
-; SSE-NEXT: movaps %xmm9, 1120(%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 1120(%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 1104(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1088(%r9)
-; SSE-NEXT: movaps %xmm10, 1072(%r9)
-; SSE-NEXT: movaps %xmm12, 1040(%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 1072(%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 1040(%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 1024(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 1008(%r9)
-; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 992(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 960(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 944(%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 928(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 912(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 880(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 864(%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 848(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 832(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 800(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 784(%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 768(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 752(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 720(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 704(%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 688(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 672(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 640(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 624(%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 608(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 592(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 560(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 544(%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 528(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 512(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 480(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 464(%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 448(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 432(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 400(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 384(%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 368(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 352(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 320(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 304(%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 288(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 272(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 240(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 224(%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 208(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 192(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 160(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 144(%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 128(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 112(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 80(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 64(%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 48(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 32(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, (%r9)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 1264(%r9)
-; SSE-NEXT: movaps %xmm5, 1216(%r9)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 1184(%r9)
-; SSE-NEXT: movaps %xmm4, 1136(%r9)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 1104(%r9)
+; SSE-NEXT: movaps %xmm3, 1216(%r9)
+; SSE-NEXT: movaps %xmm9, 1136(%r9)
; SSE-NEXT: movaps %xmm7, 1056(%r9)
+; SSE-NEXT: movaps %xmm2, 976(%r9)
+; SSE-NEXT: movaps %xmm15, 896(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 1024(%r9)
-; SSE-NEXT: movaps %xmm11, 976(%r9)
+; SSE-NEXT: movaps %xmm0, 816(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 944(%r9)
-; SSE-NEXT: movaps %xmm13, 896(%r9)
+; SSE-NEXT: movaps %xmm0, 736(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 864(%r9)
-; SSE-NEXT: movaps %xmm15, 816(%r9)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 784(%r9)
-; SSE-NEXT: movaps %xmm2, 736(%r9)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 704(%r9)
-; SSE-NEXT: movaps %xmm14, 656(%r9)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 624(%r9)
+; SSE-NEXT: movaps %xmm0, 656(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 576(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 544(%r9)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 496(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 464(%r9)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 416(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 384(%r9)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 336(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 304(%r9)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 256(%r9)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 224(%r9)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 176(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 144(%r9)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 96(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 64(%r9)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 16(%r9)
-; SSE-NEXT: addq $1736, %rsp # imm = 0x6C8
+; SSE-NEXT: addq $1688, %rsp # imm = 0x698
; SSE-NEXT: retq
;
; AVX1-ONLY-LABEL: store_i32_stride5_vf64:
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
@@ -313,50 +313,49 @@
; SSE: # %bb.0:
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; SSE-NEXT: movaps (%rdi), %xmm0
-; SSE-NEXT: movaps (%rsi), %xmm6
-; SSE-NEXT: movaps (%rdx), %xmm5
+; SSE-NEXT: movaps (%rdi), %xmm2
+; SSE-NEXT: movaps (%rsi), %xmm0
+; SSE-NEXT: movaps (%rdx), %xmm8
; SSE-NEXT: movaps (%rcx), %xmm1
; SSE-NEXT: movaps (%r8), %xmm4
-; SSE-NEXT: movaps (%r9), %xmm2
-; SSE-NEXT: movaps (%r10), %xmm8
-; SSE-NEXT: movaps %xmm5, %xmm7
-; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
-; SSE-NEXT: movaps %xmm0, %xmm3
-; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1]
-; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm7[0]
-; SSE-NEXT: movaps %xmm4, %xmm9
-; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm2[2],xmm9[3],xmm2[3]
-; SSE-NEXT: movaps %xmm5, %xmm7
-; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3]
-; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm9[0]
+; SSE-NEXT: movaps (%r9), %xmm3
+; SSE-NEXT: movaps (%r10), %xmm6
+; SSE-NEXT: movaps %xmm2, %xmm5
+; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
+; SSE-NEXT: movaps %xmm8, %xmm7
; SSE-NEXT: movaps %xmm8, %xmm9
-; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3],xmm2[3,3]
-; SSE-NEXT: movaps %xmm4, %xmm10
-; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm1[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3],xmm0[3,3]
+; SSE-NEXT: movaps %xmm2, %xmm10
+; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3]
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
+; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm8[0]
+; SSE-NEXT: movaps %xmm4, %xmm8
+; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3]
+; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3]
+; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm8[0]
+; SSE-NEXT: movaps %xmm6, %xmm8
+; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,3],xmm3[3,3]
+; SSE-NEXT: movaps %xmm4, %xmm11
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm1[1,1]
; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm9[2,0]
-; SSE-NEXT: movaps %xmm0, %xmm9
-; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm6[2],xmm9[3],xmm6[3]
-; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm9[0,1]
-; SSE-NEXT: movaps %xmm6, %xmm9
-; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1]
-; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm10[2,0]
-; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm8[2,3,2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm8[0,2]
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm6[3,3]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm5[2,0]
-; SSE-NEXT: movss {{.*#+}} xmm0 = xmm10[0],xmm0[1,2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm8[2,0]
+; SSE-NEXT: movaps %xmm6, %xmm8
+; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,1],xmm2[3,3]
+; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm9[2,0]
+; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1]
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm10[0,1]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm11[2,0]
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm2[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0,2]
; SSE-NEXT: movaps %xmm4, 16(%rax)
-; SSE-NEXT: movaps %xmm9, 32(%rax)
-; SSE-NEXT: movaps %xmm2, 48(%rax)
+; SSE-NEXT: movaps %xmm0, 32(%rax)
+; SSE-NEXT: movaps %xmm3, 48(%rax)
+; SSE-NEXT: movaps %xmm8, 80(%rax)
; SSE-NEXT: movaps %xmm1, 96(%rax)
; SSE-NEXT: movaps %xmm7, 64(%rax)
-; SSE-NEXT: movaps %xmm3, (%rax)
-; SSE-NEXT: movaps %xmm0, 80(%rax)
+; SSE-NEXT: movaps %xmm5, (%rax)
; SSE-NEXT: retq
;
; AVX1-ONLY-LABEL: store_i32_stride7_vf4:
@@ -615,138 +614,108 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind {
; SSE-LABEL: store_i32_stride7_vf8:
; SSE: # %bb.0:
-; SSE-NEXT: subq $104, %rsp
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movdqa (%rdi), %xmm8
-; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 16(%rdi), %xmm14
-; SSE-NEXT: movdqa (%rsi), %xmm3
-; SSE-NEXT: movdqa 16(%rsi), %xmm5
-; SSE-NEXT: movdqa 16(%rdx), %xmm13
-; SSE-NEXT: movdqa 16(%rcx), %xmm9
-; SSE-NEXT: movdqa 16(%r8), %xmm11
-; SSE-NEXT: movdqa %xmm11, (%rsp) # 16-byte Spill
-; SSE-NEXT: movdqa (%r9), %xmm4
-; SSE-NEXT: movaps 16(%r9), %xmm1
-; SSE-NEXT: movdqa (%rax), %xmm7
-; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 16(%rax), %xmm12
-; SSE-NEXT: movaps %xmm12, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3]
-; SSE-NEXT: movaps %xmm1, %xmm6
+; SSE-NEXT: movaps (%rdi), %xmm0
+; SSE-NEXT: movaps 16(%rdi), %xmm7
+; SSE-NEXT: movaps (%rsi), %xmm6
+; SSE-NEXT: movaps 16(%rsi), %xmm1
+; SSE-NEXT: movaps (%rdx), %xmm4
+; SSE-NEXT: movaps 16(%rdx), %xmm10
+; SSE-NEXT: movaps 16(%rcx), %xmm3
+; SSE-NEXT: movaps 16(%r8), %xmm8
+; SSE-NEXT: movaps 16(%r9), %xmm5
+; SSE-NEXT: movaps 16(%rax), %xmm11
+; SSE-NEXT: movaps %xmm11, %xmm9
+; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3],xmm5[3,3]
+; SSE-NEXT: movaps %xmm3, %xmm2
+; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm9[2,0]
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm8, %xmm12
+; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm3[1,1]
+; SSE-NEXT: movaps %xmm1, %xmm2
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,3,3,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm11, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[1,1,1,1]
-; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm14, %xmm0
-; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movdqa %xmm8, %xmm0
-; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps (%rdx), %xmm11
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,1,1]
-; SSE-NEXT: movaps %xmm11, %xmm1
-; SSE-NEXT: movss {{.*#+}} xmm1 = xmm5[0],xmm1[1,2,3]
-; SSE-NEXT: movaps (%rcx), %xmm3
-; SSE-NEXT: movaps (%r8), %xmm7
-; SSE-NEXT: movaps %xmm7, %xmm15
-; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,1],xmm3[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,0]
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm13, %xmm15
-; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm9[2],xmm15[3],xmm9[3]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm13[0]
-; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,3],xmm10[3,3]
-; SSE-NEXT: movdqa %xmm14, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm12[0,3]
-; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,1],xmm14[3,3]
-; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm13[2,0]
-; SSE-NEXT: movaps (%rsp), %xmm8 # 16-byte Reload
-; SSE-NEXT: movaps %xmm8, %xmm13
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1]
-; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm1[2,0]
-; SSE-NEXT: movaps %xmm7, %xmm1
-; SSE-NEXT: movdqa %xmm4, %xmm9
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE-NEXT: movaps %xmm11, %xmm2
+; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1]
+; SSE-NEXT: movaps %xmm10, %xmm13
+; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm3[2],xmm13[3],xmm3[3]
+; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm10[0]
; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm10, %xmm14
+; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,3],xmm1[3,3]
+; SSE-NEXT: movaps %xmm7, %xmm15
+; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,1],xmm11[0,3]
+; SSE-NEXT: movaps %xmm8, %xmm10
+; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1]
+; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm5[2],xmm8[3],xmm5[3]
+; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1]
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,1],xmm7[3,3]
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm14[2,0]
+; SSE-NEXT: movaps (%rax), %xmm14
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm12[2,0]
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm15[2,0]
+; SSE-NEXT: movaps %xmm4, %xmm2
+; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm4, %xmm12
+; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,3],xmm6[3,3]
+; SSE-NEXT: movaps %xmm14, %xmm15
+; SSE-NEXT: movaps %xmm0, %xmm9
+; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,1],xmm0[3,3]
+; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm12[2,0]
+; SSE-NEXT: movaps (%r8), %xmm1
+; SSE-NEXT: movaps (%r9), %xmm0
+; SSE-NEXT: movaps %xmm1, %xmm12
+; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm0[2],xmm12[3],xmm0[3]
+; SSE-NEXT: movaps (%rcx), %xmm3
; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; SSE-NEXT: movaps %xmm4, %xmm1
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[1,3]
-; SSE-NEXT: movaps %xmm7, %xmm0
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
-; SSE-NEXT: movaps %xmm11, %xmm1
-; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE-NEXT: movaps %xmm5, %xmm3
-; SSE-NEXT: movaps %xmm5, %xmm9
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1]
-; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm1[0]
-; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm6[2],xmm8[3],xmm6[3]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm15 = xmm15[0],xmm8[0]
-; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1]
-; SSE-NEXT: shufps $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
-; SSE-NEXT: # xmm14 = xmm14[0,1],mem[2,0]
-; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,3],xmm5[3,3]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm11[2,0]
-; SSE-NEXT: movaps %xmm3, %xmm6
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3]
-; SSE-NEXT: movss {{.*#+}} xmm6 = xmm1[0],xmm6[1,2,3]
-; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; SSE-NEXT: # xmm4 = xmm4[3,3],mem[3,3]
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm4[2,0]
-; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; SSE-NEXT: # xmm1 = mem[3,3,3,3]
-; SSE-NEXT: movss {{.*#+}} xmm7 = xmm1[0],xmm7[1,2,3]
+; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm12[0]
+; SSE-NEXT: movaps %xmm1, %xmm12
+; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1]
+; SSE-NEXT: movaps %xmm14, %xmm4
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm0[3,3]
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1]
+; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm9[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm14[0,2]
+; SSE-NEXT: movaps %xmm9, %xmm14
+; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1]
+; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm6[2],xmm9[3],xmm6[3]
+; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm9[1,1]
+; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm9[0]
+; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm8[0]
+; SSE-NEXT: movaps %xmm7, %xmm8
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm9[2],xmm8[3],xmm9[3]
+; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm8[0,1]
+; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
+; SSE-NEXT: shufps $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
+; SSE-NEXT: # xmm7 = xmm7[0,1],mem[2,0]
+; SSE-NEXT: movaps %xmm3, %xmm8
+; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,3],xmm1[3,3]
+; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm4[2,0]
+; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE-NEXT: # xmm0 = xmm0[1],mem[0]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm1[2,0]
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movaps %xmm14, 112(%rax)
-; SSE-NEXT: movdqa %xmm15, 176(%rax)
-; SSE-NEXT: movaps %xmm9, (%rax)
-; SSE-NEXT: movaps %xmm0, 16(%rax)
+; SSE-NEXT: movaps %xmm6, 32(%rax)
+; SSE-NEXT: movapd %xmm0, 48(%rax)
+; SSE-NEXT: movaps %xmm8, 96(%rax)
+; SSE-NEXT: movaps %xmm7, 112(%rax)
+; SSE-NEXT: movaps %xmm5, 160(%rax)
+; SSE-NEXT: movaps %xmm13, 176(%rax)
+; SSE-NEXT: movaps %xmm14, (%rax)
+; SSE-NEXT: movaps %xmm12, 16(%rax)
; SSE-NEXT: movaps %xmm2, 64(%rax)
-; SSE-NEXT: movaps %xmm13, 128(%rax)
-; SSE-NEXT: movaps %xmm12, 192(%rax)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 32(%rax)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 48(%rax)
-; SSE-NEXT: movaps %xmm7, 96(%rax)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 160(%rax)
-; SSE-NEXT: movaps %xmm6, 80(%rax)
+; SSE-NEXT: movaps %xmm15, 80(%rax)
+; SSE-NEXT: movaps %xmm10, 128(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 144(%rax)
+; SSE-NEXT: movaps %xmm11, 192(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 208(%rax)
-; SSE-NEXT: addq $104, %rsp
; SSE-NEXT: retq
;
; AVX1-ONLY-LABEL: store_i32_stride7_vf8:
@@ -1274,293 +1243,223 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind {
; SSE-LABEL: store_i32_stride7_vf16:
; SSE: # %bb.0:
-; SSE-NEXT: subq $536, %rsp # imm = 0x218
+; SSE-NEXT: subq $168, %rsp
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movdqa (%rdi), %xmm3
-; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa (%rsi), %xmm7
-; SSE-NEXT: movdqa 16(%rsi), %xmm9
-; SSE-NEXT: movaps (%rdx), %xmm5
-; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 16(%rdx), %xmm10
-; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps (%rcx), %xmm14
-; SSE-NEXT: movaps 16(%rcx), %xmm2
-; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps (%r8), %xmm15
-; SSE-NEXT: movaps 16(%r8), %xmm13
-; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa (%r9), %xmm11
-; SSE-NEXT: movdqa 16(%r9), %xmm12
-; SSE-NEXT: movdqa %xmm12, (%rsp) # 16-byte Spill
-; SSE-NEXT: movdqa (%rax), %xmm4
-; SSE-NEXT: movaps %xmm15, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm14[1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1]
-; SSE-NEXT: movss {{.*#+}} xmm5 = xmm1[0],xmm5[1,2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0]
-; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm3, %xmm0
-; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3]
-; SSE-NEXT: movdqa %xmm7, %xmm3
-; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1]
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0]
-; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movaps %xmm13, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 16(%rax), %xmm10
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1]
-; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movdqa 16(%rdi), %xmm13
-; SSE-NEXT: movdqa %xmm13, %xmm0
-; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3]
-; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 32(%rsi), %xmm8
-; SSE-NEXT: movaps 32(%rdx), %xmm5
-; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
-; SSE-NEXT: movaps %xmm5, %xmm1
-; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; SSE-NEXT: movaps 32(%rcx), %xmm5
-; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 32(%r8), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[1,1]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 32(%r9), %xmm5
-; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 32(%rax), %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movdqa 32(%rdi), %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3]
-; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 48(%rsi), %xmm1
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 48(%rdx), %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
-; SSE-NEXT: movaps 48(%rcx), %xmm2
-; SSE-NEXT: movaps 48(%r8), %xmm1
-; SSE-NEXT: movaps %xmm1, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1]
-; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0]
-; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 48(%rdi), %xmm0
-; SSE-NEXT: movaps 48(%rax), %xmm7
-; SSE-NEXT: movaps %xmm0, %xmm5
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm7[0,3]
-; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 48(%r9), %xmm6
-; SSE-NEXT: movaps %xmm6, %xmm5
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm7[1,1]
-; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm7[2,3]
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,3],xmm6[3,3]
+; SSE-NEXT: movaps (%rdi), %xmm0
+; SSE-NEXT: movaps 16(%rdi), %xmm13
+; SSE-NEXT: movaps (%rsi), %xmm1
+; SSE-NEXT: movaps 16(%rsi), %xmm9
+; SSE-NEXT: movaps (%rdx), %xmm2
+; SSE-NEXT: movaps 16(%rdx), %xmm15
+; SSE-NEXT: movaps (%rcx), %xmm10
+; SSE-NEXT: movaps (%r8), %xmm6
+; SSE-NEXT: movaps (%r9), %xmm3
+; SSE-NEXT: movaps (%rax), %xmm4
+; SSE-NEXT: movaps %xmm2, %xmm5
+; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1]
+; SSE-NEXT: movaps %xmm0, %xmm7
+; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm5[0]
+; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm4, %xmm5
+; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[1,3]
+; SSE-NEXT: movaps %xmm6, %xmm7
+; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm10[1,1]
+; SSE-NEXT: movaps %xmm6, %xmm8
+; SSE-NEXT: movaps %xmm2, %xmm11
+; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
+; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,3],xmm6[3,3]
+; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[0,2]
+; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm1, %xmm5
-; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
+; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm7[2,0]
; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[3,3,3,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,0]
+; SSE-NEXT: movaps %xmm4, %xmm6
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,1],xmm0[3,3]
+; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: movaps %xmm3, %xmm5
+; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm0[0,1]
; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; SSE-NEXT: movaps %xmm5, %xmm0
-; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm2, %xmm7
-; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm0[0]
-; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[1,3]
-; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm15, %xmm2
-; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,2]
-; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm11[2],xmm15[3],xmm11[3]
-; SSE-NEXT: movaps %xmm5, %xmm2
-; SSE-NEXT: movaps %xmm5, %xmm7
-; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm14[2],xmm2[3],xmm14[3]
+; SSE-NEXT: movaps 16(%rcx), %xmm7
+; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3]
+; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0]
+; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 16(%r8), %xmm5
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3]
+; SSE-NEXT: movaps 16(%r9), %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm2[2,0]
+; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 16(%rax), %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm3[3,3]
+; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm4[2,0]
+; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm7, %xmm2
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm15[0]
-; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: movaps %xmm2, %xmm0
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm13, %xmm15
-; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1]
-; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,0]
-; SSE-NEXT: movdqa %xmm13, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm10[0,3]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; SSE-NEXT: movaps %xmm14, %xmm12
-; SSE-NEXT: movaps (%rsp), %xmm3 # 16-byte Reload
-; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1]
-; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm0[2,0]
-; SSE-NEXT: movaps %xmm4, %xmm10
-; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm2[2],xmm10[3],xmm2[3]
-; SSE-NEXT: movaps %xmm14, %xmm0
-; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm5, %xmm0
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm13, %xmm11
-; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm11 = xmm11[0],xmm0[0]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm13[1,3]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; SSE-NEXT: movaps %xmm9, %xmm8
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
-; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[0,2]
-; SSE-NEXT: movaps %xmm9, %xmm0
-; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: movaps %xmm1, %xmm4
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: shufps $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; SSE-NEXT: # xmm3 = xmm3[0,1],mem[2,0]
-; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,1]
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,0]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[3,3]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,0]
-; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
-; SSE-NEXT: # xmm7 = xmm7[3,3],mem[3,3]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm7[2,0]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[2,3,2,3]
-; SSE-NEXT: movss {{.*#+}} xmm1 = xmm6[0],xmm1[1,2,3]
+; SSE-NEXT: movaps %xmm13, %xmm4
+; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0]
+; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm13, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[0,3]
+; SSE-NEXT: movaps %xmm5, %xmm4
+; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0]
+; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm5, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm7[1,1]
+; SSE-NEXT: movaps %xmm9, %xmm4
+; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm2[2,0]
+; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill
; SSE-NEXT: movaps %xmm1, %xmm2
-; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
-; SSE-NEXT: # xmm7 = xmm7[3,3],mem[3,3]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm7[2,0]
-; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
-; SSE-NEXT: # xmm6 = mem[3,3,3,3]
-; SSE-NEXT: movss {{.*#+}} xmm1 = xmm6[0],xmm1[1,2,3]
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
-; SSE-NEXT: # xmm6 = xmm6[3,3],mem[3,3]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm6[2,0]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; SSE-NEXT: movss {{.*#+}} xmm7 = xmm6[0],xmm7[1,2,3]
-; SSE-NEXT: shufps $255, (%rsp), %xmm1 # 16-byte Folded Reload
-; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3]
-; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,3],xmm1[2,0]
-; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
-; SSE-NEXT: # xmm6 = mem[3,3,3,3]
-; SSE-NEXT: movss {{.*#+}} xmm14 = xmm6[0],xmm14[1,2,3]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3]
-; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,3],xmm1[2,0]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; SSE-NEXT: movss {{.*#+}} xmm13 = xmm6[0],xmm13[1,2,3]
-; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3]
-; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,3],xmm1[2,0]
-; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
-; SSE-NEXT: # xmm6 = mem[3,3,3,3]
-; SSE-NEXT: movss {{.*#+}} xmm9 = xmm6[0],xmm9[1,2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,1],xmm13[3,3]
+; SSE-NEXT: movaps %xmm2, %xmm4
+; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm9[2],xmm13[3],xmm9[3]
+; SSE-NEXT: movaps %xmm0, %xmm2
+; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm13[0,1]
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm15, %xmm2
+; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3]
+; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,3],xmm5[3,3]
+; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3]
+; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm5[0]
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,3],xmm9[3,3]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm15[2,0]
+; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
+; SSE-NEXT: movaps 32(%rdx), %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm1[2,0]
+; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 32(%rcx), %xmm15
+; SSE-NEXT: movaps %xmm0, %xmm2
+; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1]
+; SSE-NEXT: movaps 32(%rdi), %xmm4
+; SSE-NEXT: movaps 32(%rsi), %xmm1
+; SSE-NEXT: movaps %xmm4, %xmm3
+; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
+; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 32(%rax), %xmm2
+; SSE-NEXT: movaps %xmm2, %xmm5
+; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[1,3]
+; SSE-NEXT: movaps 32(%r8), %xmm6
+; SSE-NEXT: movaps 32(%r9), %xmm3
+; SSE-NEXT: movaps %xmm6, %xmm7
+; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
+; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm5[0,2]
+; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm6, %xmm5
+; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm15[1,1]
+; SSE-NEXT: movaps %xmm1, %xmm7
+; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm0[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm5[2,0]
+; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm2, %xmm5
+; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm4[3,3]
+; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; SSE-NEXT: movaps %xmm3, %xmm14
+; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1]
+; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm4[0,1]
+; SSE-NEXT: movaps %xmm0, %xmm13
+; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm15[2],xmm13[3],xmm15[3]
+; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,3],xmm6[3,3]
+; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3]
+; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm6[0]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3]
+; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[2,0]
+; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3]
+; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm2[2,0]
+; SSE-NEXT: movaps 48(%rdx), %xmm4
+; SSE-NEXT: movaps 48(%rcx), %xmm0
+; SSE-NEXT: movaps %xmm0, %xmm2
+; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; SSE-NEXT: movaps 48(%rdi), %xmm1
+; SSE-NEXT: movaps 48(%rsi), %xmm12
+; SSE-NEXT: movaps %xmm1, %xmm7
+; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1]
+; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm2[2,0]
+; SSE-NEXT: movaps 48(%rax), %xmm8
+; SSE-NEXT: movaps %xmm1, %xmm11
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm8[0,3]
+; SSE-NEXT: movaps 48(%r8), %xmm9
+; SSE-NEXT: movaps %xmm9, %xmm10
+; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm0[1,1]
+; SSE-NEXT: movaps 48(%r9), %xmm5
+; SSE-NEXT: movaps %xmm5, %xmm6
+; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm9[1]
+; SSE-NEXT: movaps %xmm4, %xmm3
+; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3]
+; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1]
+; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm11[2,0]
+; SSE-NEXT: movaps %xmm12, %xmm11
+; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1]
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm10[2,0]
+; SSE-NEXT: movaps %xmm1, %xmm10
+; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm12[2],xmm10[3],xmm12[3]
+; SSE-NEXT: movaps %xmm5, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm8[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm10[0,1]
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,0]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm12[3,3]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm8[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[2,0]
+; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm8[3,3]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm5[0,2]
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movaps %xmm0, 416(%rax)
-; SSE-NEXT: movaps %xmm4, 400(%rax)
-; SSE-NEXT: movaps %xmm3, 384(%rax)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 352(%rax)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 336(%rax)
-; SSE-NEXT: movdqa %xmm5, 288(%rax)
-; SSE-NEXT: movaps %xmm8, 240(%rax)
-; SSE-NEXT: movdqa %xmm11, 224(%rax)
-; SSE-NEXT: movaps %xmm10, 176(%rax)
-; SSE-NEXT: movaps %xmm12, 128(%rax)
-; SSE-NEXT: movaps %xmm15, 112(%rax)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 64(%rax)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 16(%rax)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, (%rax)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 432(%rax)
+; SSE-NEXT: movaps %xmm1, 416(%rax)
+; SSE-NEXT: movaps %xmm3, 400(%rax)
+; SSE-NEXT: movaps %xmm2, 384(%rax)
+; SSE-NEXT: movaps %xmm11, 368(%rax)
+; SSE-NEXT: movaps %xmm9, 352(%rax)
+; SSE-NEXT: movaps %xmm7, 336(%rax)
+; SSE-NEXT: movaps %xmm15, 320(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 368(%rax)
-; SSE-NEXT: movaps %xmm9, 320(%rax)
-; SSE-NEXT: movaps %xmm13, 304(%rax)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 272(%rax)
+; SSE-NEXT: movaps %xmm0, 304(%rax)
+; SSE-NEXT: movaps %xmm13, 288(%rax)
+; SSE-NEXT: movaps %xmm14, 272(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 256(%rax) -; SSE-NEXT: movaps %xmm14, 208(%rax) -; SSE-NEXT: movaps %xmm7, 192(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 240(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 224(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 208(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 192(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 176(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 128(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 112(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rax) -; SSE-NEXT: movaps %xmm2, 80(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rax) -; SSE-NEXT: addq $536, %rsp # imm = 0x218 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rax) +; SSE-NEXT: addq $168, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride7_vf16: @@ -2816,615 +2715,455 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride7_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $1256, %rsp # imm = 0x4E8 +; SSE-NEXT: subq $616, %rsp # imm = 0x268 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsi), %xmm7 -; SSE-NEXT: movdqa 16(%rsi), %xmm5 -; SSE-NEXT: movaps (%rdx), %xmm9 -; SSE-NEXT: movdqa 16(%rdx), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rcx), %xmm10 -; SSE-NEXT: movaps 16(%rcx), %xmm13 -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%r8), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%r8), %xmm14 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%r9), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%r9), %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rax), %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm10[1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] -; SSE-NEXT: movaps %xmm9, %xmm3 -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3] -; 
SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm13[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rax), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 16(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rsi), %xmm1 -; SSE-NEXT: movaps 32(%rdx), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 32(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%r8), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%r9), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rax), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 32(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rsi), %xmm8 -; SSE-NEXT: movdqa 48(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] -; SSE-NEXT: movdqa %xmm8, %xmm12 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 48(%rcx), %xmm13 -; SSE-NEXT: movaps 48(%r8), %xmm0 -; 
SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm13[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%r9), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rax), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 48(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rsi), %xmm1 -; SSE-NEXT: movaps 64(%rdx), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm12 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 64(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%r8), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%r9), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rax), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rsi), %xmm12 -; SSE-NEXT: movdqa 80(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 80(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%r8), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%r9), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rax), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 80(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rsi), %xmm4 -; SSE-NEXT: movaps 96(%rdx), %xmm12 -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm1 -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 96(%rcx), %xmm3 -; SSE-NEXT: movaps 96(%r8), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[1,1] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%r9), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rax), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 96(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rsi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movaps 112(%rcx), %xmm3 -; SSE-NEXT: movaps 112(%r8), %xmm0 -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movaps 112(%r9), %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm15[0] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movaps 112(%rax), %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm12[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 
%xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: movaps (%rdi), %xmm3 +; SSE-NEXT: movaps 16(%rdi), %xmm2 +; SSE-NEXT: movaps (%rsi), %xmm4 +; SSE-NEXT: movaps 16(%rsi), %xmm0 +; SSE-NEXT: movaps (%rdx), %xmm5 +; SSE-NEXT: movaps 16(%rdx), %xmm1 +; SSE-NEXT: movaps (%rcx), %xmm12 +; SSE-NEXT: movaps (%r8), %xmm7 +; SSE-NEXT: movaps (%r9), %xmm6 +; SSE-NEXT: movaps (%rax), %xmm8 +; SSE-NEXT: movaps %xmm5, %xmm9 +; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1] +; SSE-NEXT: movaps %xmm3, %xmm10 +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm9[0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm2[1,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm11[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movaps %xmm9, %xmm1 -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm10[2],xmm1[3],xmm10[3] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[1,1],mem[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm14[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm3 -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[1,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm3[1,3] +; SSE-NEXT: movaps %xmm7, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm12[1,1] +; SSE-NEXT: movaps %xmm7, %xmm11 +; SSE-NEXT: movaps %xmm5, %xmm13 +; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,3],xmm7[3,3] +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm9[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm10[2,0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,1],xmm3[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm3[0,1] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 16(%rcx), %xmm10 +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm11[0] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[1,1],mem[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm13[2],xmm4[3],xmm13[3] -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm10[0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm13 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm11 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[2,0] -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm3[2],xmm10[3],xmm3[3] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, %xmm9 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1] -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm14[1,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm8 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps 16(%r8), %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm4[3,3] +; SSE-NEXT: movaps 16(%r9), %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm5[2,0] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 16(%rax), %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,3],xmm6[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm8[2,0] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps %xmm2, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] -; SSE-NEXT: movaps 112(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,0] -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm12[0,3] -; SSE-NEXT: movaps (%rsp), %xmm6 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[2,0] -; SSE-NEXT: movaps %xmm6, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm12[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm15[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm12[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm15[2,0] -; SSE-NEXT: pshufd $255, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm3[0,3] +; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm10[1,1] +; SSE-NEXT: movaps %xmm0, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm2[3,3] +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm2[0,1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,3],xmm7[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm7[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; 
SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,3],xmm1[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm15 = xmm0[0],xmm15[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm12 = xmm0[0],xmm12[1,2,3] -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm1[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm6 = xmm0[0],xmm6[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,3],xmm0[2,0] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm14 = xmm0[0],xmm14[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm1[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = 
xmm3[3,3],xmm4[3,3] +; SSE-NEXT: movaps 32(%rdx), %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm3[2,0] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%rcx), %xmm7 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; SSE-NEXT: movaps 32(%rdi), %xmm4 +; SSE-NEXT: movaps 32(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[1,3] +; SSE-NEXT: movaps 32(%r8), %xmm6 +; SSE-NEXT: movaps 32(%r9), %xmm3 +; SSE-NEXT: movaps %xmm6, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[0,2] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm7[1,1] +; SSE-NEXT: movaps %xmm1, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm4[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm4[0,1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,3],xmm6[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm2[2,0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 48(%rdx), %xmm0 +; SSE-NEXT: movaps 48(%rcx), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 48(%rdi), %xmm4 +; SSE-NEXT: movaps 48(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 48(%rax), %xmm2 +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[0,3] +; SSE-NEXT: movaps 48(%r8), %xmm6 +; SSE-NEXT: movaps 48(%r9), %xmm3 +; SSE-NEXT: movaps %xmm6, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm7[1,1] +; SSE-NEXT: movaps %xmm1, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = 
xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm4[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm4[0,1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,3],xmm6[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm2[2,0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rdx), %xmm0 +; SSE-NEXT: movaps 64(%rcx), %xmm7 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; SSE-NEXT: movaps 64(%rdi), %xmm4 +; SSE-NEXT: movaps 64(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[1,3] +; SSE-NEXT: movaps 64(%r8), %xmm6 +; SSE-NEXT: movaps 64(%r9), %xmm3 +; SSE-NEXT: movaps %xmm6, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[0,2] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm7[1,1] +; SSE-NEXT: movaps %xmm1, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm4[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm4[0,1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,3],xmm6[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm2[2,0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 80(%rdx), %xmm0 +; SSE-NEXT: movaps 80(%rcx), %xmm7 +; 
SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 80(%rdi), %xmm4 +; SSE-NEXT: movaps 80(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 80(%rax), %xmm2 +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[0,3] +; SSE-NEXT: movaps 80(%r8), %xmm6 +; SSE-NEXT: movaps 80(%r9), %xmm3 +; SSE-NEXT: movaps %xmm6, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm7[1,1] +; SSE-NEXT: movaps %xmm1, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm4[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm4[0,1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,3],xmm6[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm2[2,0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rdx), %xmm0 +; SSE-NEXT: movaps 96(%rcx), %xmm8 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] +; SSE-NEXT: movaps 96(%rdi), %xmm4 +; SSE-NEXT: movaps 96(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[1,3] +; SSE-NEXT: movaps 96(%r8), %xmm7 +; SSE-NEXT: movaps 96(%r9), %xmm3 +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,2] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm8[1,1] +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm6[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm4[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; 
SSE-NEXT: movaps %xmm3, %xmm15 +; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm4[0,1] +; SSE-NEXT: movaps %xmm0, %xmm14 +; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm8[2],xmm14[3],xmm8[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm7[0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm2[2,0] +; SSE-NEXT: movaps 112(%rdx), %xmm4 +; SSE-NEXT: movaps 112(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSE-NEXT: movaps 112(%rdi), %xmm1 +; SSE-NEXT: movaps 112(%rsi), %xmm13 +; SSE-NEXT: movaps %xmm1, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm2[2,0] +; SSE-NEXT: movaps 112(%rax), %xmm9 +; SSE-NEXT: movaps %xmm1, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm9[0,3] +; SSE-NEXT: movaps 112(%r8), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm0[1,1] +; SSE-NEXT: movaps 112(%r9), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm10[1] +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm5[0] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm12[2,0] +; SSE-NEXT: movaps %xmm13, %xmm12 +; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm11[2,0] +; SSE-NEXT: movaps %xmm1, %xmm11 +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm13[2],xmm11[3],xmm13[3] +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm9[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm11[0,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm13[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm9[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm9[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm5[0,2] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm3, 864(%rax) -; SSE-NEXT: movaps %xmm5, 848(%rax) +; SSE-NEXT: movaps %xmm0, 880(%rax) +; SSE-NEXT: movaps %xmm1, 864(%rax) +; SSE-NEXT: movaps %xmm3, 848(%rax) +; SSE-NEXT: movaps %xmm2, 832(%rax) +; SSE-NEXT: movaps %xmm12, 816(%rax) +; SSE-NEXT: movaps %xmm10, 800(%rax) +; SSE-NEXT: movaps %xmm7, 784(%rax) +; SSE-NEXT: movaps %xmm8, 768(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 832(%rax) +; SSE-NEXT: movaps %xmm0, 752(%rax) +; SSE-NEXT: movaps %xmm14, 736(%rax) +; SSE-NEXT: movaps %xmm15, 720(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 704(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 688(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 672(%rax) +; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 656(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 640(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 624(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 608(%rax) ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 800(%rax) -; SSE-NEXT: movaps %xmm4, 784(%rax) -; SSE-NEXT: movaps %xmm7, 736(%rax) -; SSE-NEXT: movaps %xmm8, 688(%rax) -; SSE-NEXT: movaps %xmm9, 672(%rax) -; SSE-NEXT: movaps %xmm10, 624(%rax) -; SSE-NEXT: movaps %xmm11, 576(%rax) -; SSE-NEXT: movaps %xmm13, 560(%rax) +; SSE-NEXT: movaps %xmm0, 592(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 576(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 560(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 544(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 528(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 512(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 496(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 480(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 464(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 448(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 400(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 352(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 336(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 288(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 240(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 224(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 176(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 112(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 880(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 816(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 768(%rax) -; SSE-NEXT: movaps %xmm14, 752(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 720(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps 
%xmm0, 704(%rax) -; SSE-NEXT: movaps %xmm1, 656(%rax) -; SSE-NEXT: movaps %xmm2, 640(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 608(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 592(%rax) -; SSE-NEXT: movaps %xmm6, 544(%rax) -; SSE-NEXT: movaps %xmm12, 528(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 496(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 480(%rax) -; SSE-NEXT: movaps %xmm15, 432(%rax) +; SSE-NEXT: movaps %xmm0, 432(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 416(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 400(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 384(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 368(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 352(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 336(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 320(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 304(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 288(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 272(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 256(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 240(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 224(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 176(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 128(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 112(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rax) -; SSE-NEXT: addq $1256, %rsp # imm = 0x4E8 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rax) +; SSE-NEXT: addq 
$616, %rsp # imm = 0x268 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride7_vf32: @@ -5949,1267 +5688,920 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride7_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $2760, %rsp # imm = 0xAC8 +; SSE-NEXT: subq $1512, %rsp # imm = 0x5E8 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsi), %xmm5 -; SSE-NEXT: movdqa 16(%rsi), %xmm4 -; SSE-NEXT: movaps (%rdx), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdx), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rcx), %xmm9 -; SSE-NEXT: movaps 16(%rcx), %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%r8), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%r8), %xmm8 -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%r9), %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%r9), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rax), %xmm15 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm10[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rax), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 16(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rsi), %xmm1 -; SSE-NEXT: movaps 32(%rdx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 32(%rcx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%r8), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%r9), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rax), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 32(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rsi), %xmm2 -; SSE-NEXT: movdqa 48(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 48(%rcx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%r8), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%r9), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rax), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 48(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rsi), %xmm1 -; SSE-NEXT: movaps 64(%rdx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 64(%rcx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%r8), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%r9), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rax), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq 
{{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rsi), %xmm2 -; SSE-NEXT: movdqa 80(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 80(%rcx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%r8), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%r9), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rax), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 80(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rsi), %xmm1 -; SSE-NEXT: movaps 96(%rdx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 96(%rcx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%r8), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%r9), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rax), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 96(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rsi), %xmm2 -; SSE-NEXT: movdqa 112(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 112(%rcx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 112(%r8), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%r9), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rax), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 112(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rsi), %xmm1 -; SSE-NEXT: movaps 128(%rdx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 128(%rcx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%r8), %xmm0 -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%r9), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rax), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 128(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rsi), %xmm2 -; SSE-NEXT: movdqa 144(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 144(%rcx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 144(%r8), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%r9), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rax), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = 
xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 144(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rsi), %xmm1 -; SSE-NEXT: movaps 160(%rdx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 160(%rcx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%r8), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%r9), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rax), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 160(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rsi), %xmm2 -; SSE-NEXT: movdqa 176(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 176(%rcx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 176(%r8), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%r9), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rax), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 176(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rsi), %xmm1 -; SSE-NEXT: movaps 192(%rdx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, 
%xmm2 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 192(%rcx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 192(%r8), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%r9), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rax), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 192(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rsi), %xmm3 -; SSE-NEXT: movdqa 208(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 208(%rcx), %xmm8 -; SSE-NEXT: movaps 208(%r8), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm8[1,1] -; SSE-NEXT: movaps %xmm8, %xmm11 -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%r9), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rax), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 208(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm12[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 224(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%rdx), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: movaps 224(%rcx), %xmm3 -; SSE-NEXT: movaps 224(%r8), %xmm0 -; SSE-NEXT: movaps %xmm0, 
%xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movaps 224(%r9), %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm11[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movaps 224(%rax), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 240(%rsi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 240(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movaps 240(%rcx), %xmm8 -; SSE-NEXT: movaps 240(%r8), %xmm0 -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm8[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movaps 240(%r9), %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm12[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm1 -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movaps 240(%rax), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm9, %xmm2 -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm4[1,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm15[0,2] -; SSE-NEXT: movaps %xmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[1,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: 
punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm14[1,3] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,0] -; 
SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: 
movaps %xmm1, %xmm5 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1,3] -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm2[2],xmm15[3],xmm2[3] -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,1],xmm14[3,3] -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm13 -; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm3[2],xmm13[3],xmm3[3] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm3[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps (%rdi), %xmm3 +; SSE-NEXT: movaps 16(%rdi), %xmm1 +; SSE-NEXT: movaps (%rsi), %xmm4 +; SSE-NEXT: movaps 16(%rsi), %xmm0 +; SSE-NEXT: movaps (%rdx), %xmm5 +; SSE-NEXT: movaps 16(%rdx), %xmm2 +; SSE-NEXT: movaps (%rcx), %xmm11 +; SSE-NEXT: movaps (%r8), %xmm6 +; SSE-NEXT: movaps (%r9), %xmm7 +; SSE-NEXT: movaps (%rax), %xmm8 +; SSE-NEXT: movaps %xmm5, %xmm9 +; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1] ; SSE-NEXT: movaps %xmm3, %xmm10 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,0] -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm3[0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm2[2,0] +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm9[0] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm3[1,3] +; SSE-NEXT: movaps %xmm6, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm11[1,1] +; SSE-NEXT: movaps %xmm11, %xmm12 +; SSE-NEXT: movaps %xmm6, %xmm11 +; SSE-NEXT: movaps %xmm5, %xmm13 +; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,3],xmm6[3,3] +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm9[0,2] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm10[2,0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,1],xmm3[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE-NEXT: movaps %xmm7, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm3[0,1] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 16(%rcx), %xmm10 +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm11[0] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 16(%r8), %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm4[3,3] +; SSE-NEXT: movaps 16(%r9), %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm5[2,0] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 16(%rax), %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,3],xmm7[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm8[2,0] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = 
xmm5[0],xmm2[0] +; SSE-NEXT: movaps %xmm1, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm3[0,3] +; SSE-NEXT: movaps %xmm6, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm10[1,1] +; SSE-NEXT: movaps %xmm0, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm1[3,3] +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm1[0,1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm10[2],xmm1[3],xmm10[3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,3],xmm6[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm2[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm4[3,3] +; SSE-NEXT: movaps 32(%rdx), %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm3[2,0] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%rcx), %xmm7 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; SSE-NEXT: movaps 32(%rdi), %xmm4 +; SSE-NEXT: movaps 32(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[1,3] +; SSE-NEXT: movaps 32(%r8), %xmm6 +; SSE-NEXT: movaps 32(%r9), %xmm3 +; SSE-NEXT: movaps %xmm6, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[0,2] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm7[1,1] +; SSE-NEXT: movaps %xmm1, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm4[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm4[0,1] +; SSE-NEXT: movaps %xmm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,3],xmm6[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movaps 224(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm9 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm7 -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm11[2,0] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm11 -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm8[2],xmm11[3],xmm8[3] -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: movaps 240(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm2[2,0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 48(%rdx), %xmm0 +; SSE-NEXT: movaps 48(%rcx), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 48(%rdi), %xmm4 +; SSE-NEXT: movaps 48(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 48(%rax), %xmm2 +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[0,3] +; SSE-NEXT: movaps 48(%r8), %xmm6 +; SSE-NEXT: movaps 48(%r9), %xmm3 +; SSE-NEXT: movaps %xmm6, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: 
shufps {{.*#+}} xmm5 = xmm5[1,1],xmm7[1,1] +; SSE-NEXT: movaps %xmm1, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm4[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; SSE-NEXT: movaps %xmm3, %xmm8 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm2[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm8[2,0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm4[0,1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,3],xmm6[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm2[2,0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rdx), %xmm0 +; SSE-NEXT: movaps 64(%rcx), %xmm7 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; SSE-NEXT: movaps 64(%rdi), %xmm4 +; SSE-NEXT: movaps 64(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[1,3] +; SSE-NEXT: movaps 64(%r8), %xmm6 +; SSE-NEXT: movaps 64(%r9), %xmm3 +; SSE-NEXT: movaps %xmm6, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[0,2] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm7[1,1] +; SSE-NEXT: movaps %xmm1, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm4[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; SSE-NEXT: movaps %xmm3, %xmm8 -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm8[0,1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = 
xmm3[3,1],xmm2[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm12[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: 
movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm4[0],xmm2[1,2,3] -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; 
SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm4[0,1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm4[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,3],xmm0[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm12 = xmm4[0],xmm12[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm8 = xmm4[0],xmm8[1,2,3] -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,3],xmm6[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = 
xmm7[0,2],xmm2[2,0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 80(%rdx), %xmm0 +; SSE-NEXT: movaps 80(%rcx), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 80(%rdi), %xmm4 +; SSE-NEXT: movaps 80(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 80(%rax), %xmm2 +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[0,3] +; SSE-NEXT: movaps 80(%r8), %xmm6 +; SSE-NEXT: movaps 80(%r9), %xmm3 +; SSE-NEXT: movaps %xmm6, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm7[1,1] +; SSE-NEXT: movaps %xmm1, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm4[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm4[0,1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,3],xmm6[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm2[2,0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rdx), %xmm0 +; SSE-NEXT: movaps 96(%rcx), %xmm7 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; SSE-NEXT: movaps 96(%rdi), %xmm4 +; SSE-NEXT: movaps 96(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[1,3] +; SSE-NEXT: movaps 96(%r8), %xmm6 +; SSE-NEXT: movaps 96(%r9), %xmm3 +; SSE-NEXT: movaps %xmm6, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[0,2] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm7[1,1] +; SSE-NEXT: movaps %xmm1, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm4[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm4[0,1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,3],xmm6[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm2[2,0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%rdx), %xmm0 +; SSE-NEXT: movaps 112(%rcx), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 112(%rdi), %xmm4 +; SSE-NEXT: movaps 112(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%rax), %xmm2 +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[0,3] +; SSE-NEXT: movaps 112(%r8), %xmm6 +; SSE-NEXT: movaps 112(%r9), %xmm3 +; SSE-NEXT: movaps %xmm6, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm7[1,1] +; SSE-NEXT: movaps %xmm1, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm4[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm4[0,1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,3],xmm6[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm2[2,0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 128(%rdx), %xmm0 +; SSE-NEXT: movaps 128(%rcx), %xmm7 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: 
unpcklps {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; SSE-NEXT: movaps 128(%rdi), %xmm4 +; SSE-NEXT: movaps 128(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 128(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[1,3] +; SSE-NEXT: movaps 128(%r8), %xmm6 +; SSE-NEXT: movaps 128(%r9), %xmm3 +; SSE-NEXT: movaps %xmm6, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[0,2] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm7[1,1] +; SSE-NEXT: movaps %xmm1, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm4[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm4[0,1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,3],xmm6[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm2[2,0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 144(%rdx), %xmm0 +; SSE-NEXT: movaps 144(%rcx), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 144(%rdi), %xmm4 +; SSE-NEXT: movaps 144(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 144(%rax), %xmm2 +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[0,3] +; SSE-NEXT: movaps 144(%r8), %xmm6 +; SSE-NEXT: movaps 144(%r9), %xmm3 +; SSE-NEXT: movaps %xmm6, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm7[1,1] +; SSE-NEXT: movaps %xmm1, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm4[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: 
movaps %xmm3, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm4[0,1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,3],xmm6[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm2[2,0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 160(%rdx), %xmm0 +; SSE-NEXT: movaps 160(%rcx), %xmm7 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; SSE-NEXT: movaps 160(%rdi), %xmm4 +; SSE-NEXT: movaps 160(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 160(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[1,3] +; SSE-NEXT: movaps 160(%r8), %xmm6 +; SSE-NEXT: movaps 160(%r9), %xmm3 +; SSE-NEXT: movaps %xmm6, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[0,2] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm7[1,1] +; SSE-NEXT: movaps %xmm1, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm4[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm4[0,1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,3],xmm6[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm2[2,0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 176(%rdx), %xmm0 +; SSE-NEXT: movaps 176(%rcx), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 176(%rdi), %xmm4 +; SSE-NEXT: movaps 176(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = 
xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 176(%rax), %xmm2 +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[0,3] +; SSE-NEXT: movaps 176(%r8), %xmm6 +; SSE-NEXT: movaps 176(%r9), %xmm3 +; SSE-NEXT: movaps %xmm6, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm7[1,1] +; SSE-NEXT: movaps %xmm1, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm4[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm4[0,1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,3],xmm6[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm2[2,0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 192(%rdx), %xmm0 +; SSE-NEXT: movaps 192(%rcx), %xmm7 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; SSE-NEXT: movaps 192(%rdi), %xmm4 +; SSE-NEXT: movaps 192(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 192(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[1,3] +; SSE-NEXT: movaps 192(%r8), %xmm6 +; SSE-NEXT: movaps 192(%r9), %xmm3 +; SSE-NEXT: movaps %xmm6, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[0,2] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm7[1,1] +; SSE-NEXT: movaps %xmm1, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm4[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm4[0,1] +; SSE-NEXT: movaps %xmm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,3],xmm6[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm2[2,0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 208(%rdx), %xmm0 +; SSE-NEXT: movaps 208(%rcx), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 208(%rdi), %xmm4 +; SSE-NEXT: movaps 208(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 208(%rax), %xmm2 +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[0,3] +; SSE-NEXT: movaps 208(%r8), %xmm6 +; SSE-NEXT: movaps 208(%r9), %xmm3 +; SSE-NEXT: movaps %xmm6, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm7[1,1] +; SSE-NEXT: movaps %xmm1, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm4[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm4[0,1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[2,0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 224(%rdx), %xmm5 +; SSE-NEXT: movaps 224(%rcx), %xmm15 +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] +; SSE-NEXT: movaps 224(%rdi), %xmm13 +; SSE-NEXT: movaps 224(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm13, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 
224(%rax), %xmm1 +; SSE-NEXT: movaps %xmm13, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm1[0,3] +; SSE-NEXT: movaps 224(%r8), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm15[1,1] +; SSE-NEXT: movaps 224(%r9), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm10[1] +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm15[2],xmm7[3],xmm15[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm10[2],xmm15[3],xmm10[3] +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm2[0] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm6[2,0] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm4[2,0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: movaps %xmm2, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm4[0,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm3[2,0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm0[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm5[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm2[0,2] +; SSE-NEXT: movaps 240(%rdx), %xmm4 +; SSE-NEXT: movaps 240(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; SSE-NEXT: movaps 240(%rdi), %xmm5 +; SSE-NEXT: movaps 240(%rsi), %xmm11 +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm3[2,0] +; SSE-NEXT: movaps 240(%rax), %xmm9 +; SSE-NEXT: movaps %xmm5, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm9[0,3] +; SSE-NEXT: movaps 240(%r8), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm0[1,1] +; SSE-NEXT: movaps 240(%r9), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm7[1] +; SSE-NEXT: movaps %xmm4, %xmm2 +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm3[0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm14[2,0] +; SSE-NEXT: movaps %xmm11, %xmm14 +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm10[2,0] +; SSE-NEXT: movaps %xmm5, %xmm10 +; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm9[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm10[0,1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm11[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm9[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm9[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm3[0,2] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm3, 1760(%rax) -; SSE-NEXT: movaps %xmm11, 1744(%rax) +; SSE-NEXT: movaps 
%xmm0, 1776(%rax) +; SSE-NEXT: movaps %xmm5, 1760(%rax) +; SSE-NEXT: movaps %xmm2, 1744(%rax) +; SSE-NEXT: movaps %xmm1, 1728(%rax) +; SSE-NEXT: movaps %xmm14, 1712(%rax) +; SSE-NEXT: movaps %xmm7, 1696(%rax) +; SSE-NEXT: movaps %xmm6, 1680(%rax) +; SSE-NEXT: movaps %xmm15, 1664(%rax) +; SSE-NEXT: movaps %xmm13, 1648(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1728(%rax) +; SSE-NEXT: movaps %xmm0, 1632(%rax) +; SSE-NEXT: movaps %xmm12, 1616(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1696(%rax) -; SSE-NEXT: movaps %xmm5, 1680(%rax) -; SSE-NEXT: movaps %xmm6, 1648(%rax) -; SSE-NEXT: movaps %xmm7, 1632(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1616(%rax) +; SSE-NEXT: movaps %xmm0, 1600(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1584(%rax) -; SSE-NEXT: movaps %xmm9, 1568(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1568(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1552(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1536(%rax) -; SSE-NEXT: movaps %xmm13, 1520(%rax) -; SSE-NEXT: movaps %xmm10, 1472(%rax) -; SSE-NEXT: movaps %xmm14, 1456(%rax) -; SSE-NEXT: movaps %xmm15, 1408(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1520(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1504(%rax) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1488(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1472(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1456(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1440(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1424(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1408(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1392(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1376(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1360(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1344(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1328(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1312(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1296(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1280(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1264(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1248(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1232(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 
16-byte Reload +; SSE-NEXT: movaps %xmm0, 1216(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1200(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1184(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1168(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1152(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1136(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1120(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1104(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1088(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1072(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1056(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1040(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1024(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1008(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 960(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 912(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 896(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 848(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 800(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 784(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 736(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 688(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 672(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 624(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 576(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 560(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 512(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 464(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 448(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 400(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 352(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 336(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 288(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 240(%rax) -; 
SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 224(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 176(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 112(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1776(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1712(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1664(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1600(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1552(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1504(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 1488(%rax) -; SSE-NEXT: movaps %xmm4, 1440(%rax) -; SSE-NEXT: movaps %xmm8, 1424(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 1392(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 1376(%rax) -; SSE-NEXT: movaps %xmm12, 1328(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1312(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 1280(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 1264(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1216(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1200(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 1168(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 1152(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1104(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1088(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1056(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1040(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 992(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 976(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 960(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 944(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 928(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte 
Reload +; SSE-NEXT: movaps %xmm0, 912(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 896(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 880(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 864(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 848(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 832(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 816(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 800(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 784(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 768(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 752(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 736(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 720(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 704(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 688(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 672(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 656(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 640(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 624(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 608(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 592(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 576(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 560(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 544(%rax) -; SSE-NEXT: movaps %xmm2, 528(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 528(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 512(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 496(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 480(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 464(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 448(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 432(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 416(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 400(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 384(%rax) ; 
SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 368(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 352(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 336(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 320(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 304(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 288(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 272(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 256(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 240(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 224(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 176(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 128(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 112(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rax) -; SSE-NEXT: addq $2760, %rsp # imm = 0xAC8 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rax) +; SSE-NEXT: addq $1512, %rsp # imm = 0x5E8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride7_vf64: diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll @@ -332,132 +332,130 @@ ; ; AVX1-ONLY-LABEL: store_i64_stride5_vf8: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm3 -; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm5 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm9 +; 
AVX1-ONLY-NEXT: vmovapd (%r8), %ymm6 +; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm3 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm7 ; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm11 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm7[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1],ymm4[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm10[0],mem[0],ymm10[2],mem[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0],ymm5[1],ymm7[2,3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = mem[2,3],ymm10[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm11[0],ymm9[0],ymm11[2],ymm9[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm5[2],ymm9[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],ymm13[2],ymm10[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1,2],ymm12[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm5[0],ymm10[1,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm3[2,3],ymm12[4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 48(%rsi), %xmm13 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm13[1],xmm11[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm11[0,1,2],ymm5[3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm6[0],mem[0] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm8[1,2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm3[1],ymm4[2,3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm8 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[1],ymm10[1],ymm8[2],ymm10[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1,2],ymm6[3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = mem[0,0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm5[0],ymm9[0],ymm5[2],ymm9[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm12[0],ymm9[1,2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0],ymm6[1],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovlpd {{.*#+}} xmm11 = mem[0],xmm11[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), 
%ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],ymm6[2],ymm10[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm12[2],ymm5[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm7[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm6, (%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm2, 16(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm8, 160(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm7, 160(%r9) ; AVX1-ONLY-NEXT: vmovaps %xmm3, 176(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 64(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm10, 192(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm9, 256(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm7, 224(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm5, 288(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm9, 64(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm8, 128(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm4, 224(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 192(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm0, 256(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm5, 32(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm10, 96(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm11, 288(%r9) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i64_stride5_vf8: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm4 -; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm6 -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm9 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm1 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm8 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm7[0],mem[0],ymm7[2],mem[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm9[2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm10 -; AVX2-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm7[0,1,2,3],ymm11[4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = 
ymm11[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm9[0,1],ymm10[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm5[2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 56(%rsi), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,2,3,3] -; AVX2-ONLY-NEXT: vbroadcastsd 24(%rsi), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = mem[2,3],ymm7[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm4[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm8[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm9 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm5 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm7 +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm11 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm3 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm10[0,1,2,3],ymm12[4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm4[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = mem[0,0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = mem[0,0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],mem[0],ymm1[2],mem[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm6[2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,2,3,3] +; AVX2-ONLY-NEXT: vbroadcastsd 24(%rsi), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = 
ymm14[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm9[2,3],ymm10[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm15[2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] +; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm9 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm9[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm5 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm6 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps %xmm1, 16(%r9) +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps %xmm2, 16(%r9) ; AVX2-ONLY-NEXT: vmovaps %xmm5, (%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm3, 176(%r9) ; AVX2-ONLY-NEXT: vmovaps %xmm6, 160(%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm4, 176(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 64(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 192(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 256(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 224(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 64(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 224(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 192(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 288(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 256(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 128(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 288(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 128(%r9) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -780,173 +778,159 @@ ; ; AVX1-ONLY-LABEL: store_i64_stride5_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $216, %rsp -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm13 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm13[1],ymm1[3],ymm13[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],mem[0],ymm8[2],mem[2] -; AVX1-ONLY-NEXT: vmovapd 96(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps 
{{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],mem[0],ymm5[2],mem[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2],ymm6[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],mem[0],ymm6[2],mem[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm15[0],mem[0],ymm15[2],mem[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm8[0,1],ymm0[2],ymm8[3] -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm10[0,1,2],ymm12[3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm0 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm0[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm5[0],ymm0[0],ymm5[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 48(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm5[0,1],ymm6[2],ymm5[3] -; 
AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm0 -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm0[0],mem[1,2,3] +; AVX1-ONLY-NEXT: subq $152, %rsp +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovapd 64(%rsi), %ymm11 +; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm6 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0],ymm5[1,2,3] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm7 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm6[2,3],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0],ymm8[1,2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm14[0],ymm0[1],ymm14[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm1[0,1],ymm0[2],ymm1[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3],ymm3[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm10[0],ymm8[0],ymm10[2],ymm8[3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm8[0],ymm12[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm12 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm11[2,3],ymm12[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] +; AVX1-ONLY-NEXT: vmovapd %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0],ymm13[1,2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm14 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm14[0],ymm13[0],ymm14[2],ymm13[3] +; AVX1-ONLY-NEXT: vmovlpd {{.*#+}} xmm10 = mem[0],xmm10[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm10[0,1],ymm15[2],ymm10[3] +; AVX1-ONLY-NEXT: vmovlpd {{.*#+}} xmm10 = mem[0],xmm14[1] +; AVX1-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm10[0,1],ymm14[2],ymm10[3] +; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[1],ymm10[1],ymm7[2],ymm10[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm10[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1],ymm10[2],ymm4[3] +; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm7[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm14[0,1],ymm0[2],ymm14[3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm13[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm13 -; AVX1-ONLY-NEXT: vblendps $63, (%rsp), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm13[0,1],ymm7[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm13[2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm13[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd 112(%rsi), %xmm13 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm13[1],xmm8[1] -; AVX1-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm13[2],ymm8[3] -; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm13 -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[0],ymm13[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm13[0],ymm3[1,2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm13[2],ymm1[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1,2],ymm13[3] -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm0 +; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0],ymm0[1],mem[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd $11, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = mem[0,1],ymm0[2],mem[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm15[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 80(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm12[1],ymm0[1],ymm12[2],ymm0[2] ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm9, %ymm9 +; 
AVX1-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3] +; AVX1-ONLY-NEXT: vmovapd 64(%r8), %ymm9 +; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0],ymm9[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm9[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3] +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0],ymm7[1],ymm11[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0,1],ymm7[2],ymm13[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0],ymm0[1,2,3] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm7 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm10, 16(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm7, (%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm2, 496(%r9) +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps %xmm4, 16(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm15, (%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm9, 496(%r9) ; AVX1-ONLY-NEXT: vmovaps %xmm6, 480(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm12, 160(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm11, 336(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm13, 320(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm1, 576(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm3, 512(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 384(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 352(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm2, 176(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm10, 160(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm12, 336(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm7, 320(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm13, 576(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm11, 544(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm0, 512(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm1, 448(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm3, 384(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm8, 352(%r9) ; AVX1-ONLY-NEXT: vmovapd %ymm14, 256(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm9, 224(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte 
Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm8, 608(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm5, 544(%r9) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm5, 608(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r9) -; AVX1-ONLY-NEXT: addq $216, %rsp +; AVX1-ONLY-NEXT: addq $152, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i64_stride5_vf16: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $312, %rsp # imm = 0x138 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm6 +; AVX2-ONLY-NEXT: subq $216, %rsp +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm8 +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 @@ -955,137 +939,136 @@ ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm7[0],mem[0],ymm7[2],mem[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm6[0],mem[0],ymm6[2],mem[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: 
vunpcklpd {{.*#+}} ymm3 = ymm5[0],mem[0],ymm5[2],mem[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm3[4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm11[0],mem[0],ymm11[2],mem[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],mem[0],ymm4[2],mem[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1],ymm3[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm10 +; AVX2-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm2 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm12[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm12 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm0 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,3,3] +; AVX2-ONLY-NEXT: vbroadcastsd 24(%rsi), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm11[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm1 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm11[1],ymm1[1],ymm11[3],ymm1[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,2,3,3] +; AVX2-ONLY-NEXT: vbroadcastsd 88(%rsi), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm13[0,1],ymm14[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm0[2,3],ymm7[2,3] +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm12 +; AVX2-ONLY-NEXT: vblendps $252, (%rsp), %ymm12, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm12[0,1],mem[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm9 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3],ymm3[4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1],ymm12[2,3],mem[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm14[0],mem[0],ymm14[2],mem[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm12[6,7] +; 
AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm12[4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 56(%rsi), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 120(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm15[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm3 -; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm10 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm3[1],ymm10[1],ymm3[3],ymm10[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,2,3,3] -; AVX2-ONLY-NEXT: vbroadcastsd 24(%rsi), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm8[0,1],ymm15[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX2-ONLY-NEXT: vbroadcastsd 88(%rsi), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1],ymm8[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm10[0],ymm3[2],ymm10[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm6[2,3] -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm6 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm6[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1],ymm6[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm9 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm9[2,3],ymm6[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm6[0,1],ymm12[2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] +; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm9 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm6[2,3],ymm9[2,3] ; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm0 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm0[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} 
ymm5 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm0[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm0[4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm1[0],ymm11[2],ymm1[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] ; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm1 ; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm11 = ymm1[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[0,1],ymm1[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm14[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm1[2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm1 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3],ymm5[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1],ymm14[2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm1 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = ymm1[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1],ymm1[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm1[4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm14[2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm2[0,1],ymm15[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm2[4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm0[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm15 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps %xmm10, 16(%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm15, (%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm9, 496(%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm12, 480(%r9) +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm9 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps %xmm12, 16(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm9, (%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm10, 496(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm8, 480(%r9) ; AVX2-ONLY-NEXT: vmovaps %xmm0, 176(%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm13, 160(%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm1, 336(%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm14, 320(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 576(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm7, 160(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm15, 336(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm2, 320(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 608(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 576(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm3, 544(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 512(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 416(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 512(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 416(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm6, 384(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm11, 352(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 256(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 288(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1098,12 +1081,9 @@ ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 608(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%r9) -; AVX2-ONLY-NEXT: addq $312, %rsp # imm = 0x138 +; AVX2-ONLY-NEXT: addq $216, %rsp ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ 
-1714,309 +1694,273 @@ ; ; AVX1-ONLY-LABEL: store_i64_stride5_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm7[1],ymm1[3],ymm7[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],mem[0],ymm6[2],mem[2] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 144(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],mem[0],ymm11[2],mem[2] -; AVX1-ONLY-NEXT: vmovapd 160(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],mem[0],ymm4[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 208(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm9 -; AVX1-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],mem[0],ymm9[2],mem[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: subq $936, %rsp # imm = 0x3A8 +; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3] -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovapd 128(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vmovapd 64(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm2 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm9[0],ymm2[0],ymm9[2],ymm2[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm3 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm14[0],mem[0],ymm14[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm8[0],ymm1[0],ymm8[2],ymm1[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm3 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm7[0],mem[0],ymm7[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],mem[0],ymm14[2],mem[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: 
vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vmovapd %ymm5, %ymm8 +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],mem[0],ymm5[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vbroadcastsd 136(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastsd 168(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm11[0,1],ymm0[2],ymm11[3] -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm0[0,1,2],ymm3[3] -; AVX1-ONLY-NEXT: vbroadcastsd 200(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm5 -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 232(%rsi), %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm0[0,1],ymm4[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm4[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte 
Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm15 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm15[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0,1],ymm15[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm15[4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = mem[2,3],ymm14[2,3] -; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm14 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm14[0],ymm9[0],ymm14[2],ymm9[3] -; AVX1-ONLY-NEXT: vmovapd 48(%rsi), %xmm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm15[1],xmm14[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3] -; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm15 -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm15[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0],ymm15[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovapd 192(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm10[0],mem[0],ymm10[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm14[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovlpd {{.*#+}} xmm0 = mem[0],xmm0[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1],ymm4[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd %ymm7, %ymm3 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = mem[2,3],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm5[0],ymm4[0],ymm5[2],ymm4[3] +; AVX1-ONLY-NEXT: vmovlpd {{.*#+}} xmm4 = mem[0],xmm5[1] +; AVX1-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = 
ymm4[0,1],ymm5[2],ymm4[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = mem[2,3],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovapd 176(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[2],ymm4[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovlpd {{.*#+}} xmm4 = mem[0],xmm5[1] +; AVX1-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm5[2],ymm4[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = mem[2,3],ymm10[2,3] +; AVX1-ONLY-NEXT: vmovapd 240(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm8[0],ymm4[0],ymm8[2],ymm4[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovlpd {{.*#+}} xmm4 = mem[0],xmm8[1] +; AVX1-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm8[2],ymm4[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm4[1],ymm2[2],ymm4[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm15 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm15[2],ymm9[3] -; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm14[0,1,2],ymm15[3] -; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm9 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0,1,2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm9[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $243, (%rsp), %ymm9, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0,1],ymm9[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm9[0],ymm2[0],ymm9[2],ymm2[3] -; AVX1-ONLY-NEXT: vmovapd 112(%rsi), %xmm14 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm14[1],xmm9[1] -; AVX1-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm14 +; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm15 +; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = mem[0],ymm15[1],mem[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm15[3] +; AVX1-ONLY-NEXT: 
vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0],ymm9[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm9[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm9 +; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0],ymm9[1],mem[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1],ymm9[2],ymm6[3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 80(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm1[1],ymm0[1],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm9 +; AVX1-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm14 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm14[2],ymm9[3] -; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm14 -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0],ymm14[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm14[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm14[2],ymm2[3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm9[0,1,2],ymm14[3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 128(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = ymm2[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
-; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm11[2,3]
-; AVX1-ONLY-NEXT: vmovapd 176(%rdx), %xmm2
-; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[3]
-; AVX1-ONLY-NEXT: vmovapd 176(%rsi), %xmm14
-; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm14[1],xmm2[1]
-; AVX1-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm14
-; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm14[2],ymm2[3]
-; AVX1-ONLY-NEXT: vmovapd 160(%r8), %ymm14
-; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload
-; AVX1-ONLY-NEXT: # ymm9 = mem[0],ymm14[1],mem[2,3]
-; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm14[0],ymm12[1,2,3]
-; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm14[2],ymm1[3]
+; AVX1-ONLY-NEXT: vmovapd 64(%r8), %ymm14
+; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload
+; AVX1-ONLY-NEXT: # ymm1 = mem[0],ymm14[1],mem[2,3]
; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm14[3]
-; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm1[0],mem[0]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7]
-; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm14
-; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload
-; AVX1-ONLY-NEXT: # ymm1 = mem[0,1],ymm14[2,3],mem[4,5,6,7]
-; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload
-; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5],ymm14[6,7]
-; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1],ymm10[2,3,4,5,6,7]
-; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5],ymm2[6,7]
-; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3]
-; AVX1-ONLY-NEXT: vmovapd 240(%rdx), %xmm2
-; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[3]
-; AVX1-ONLY-NEXT: vmovapd 240(%rsi), %xmm14
-; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm14[1],xmm2[1]
-; AVX1-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm14
-; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm14[2],ymm2[3]
-; AVX1-ONLY-NEXT: vmovapd 224(%r8), %ymm14
-; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm14[0],ymm8[1,2,3]
-; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0],ymm14[1],ymm4[2,3]
+; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm6[0,1,2],ymm14[3]
; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2],ymm0[3]
; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm14[3]
+; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0],ymm9[1,2,3]
; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm13 = xmm0[0],mem[0]
-; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[0]
-; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0]
-; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm14
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0]
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm0[0],mem[0]
+; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm4
+; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm6
+; AVX1-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm9
+; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm9[2],ymm6[3]
+; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm9
+; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload
+; AVX1-ONLY-NEXT: # ymm0 = mem[0],ymm9[1],mem[2,3]
+; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm9[2],ymm7[3]
+; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm11[0,1,2],ymm9[3]
+; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0],ymm6[1,2,3]
+; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vmovapd 144(%rdx), %xmm6
+; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm0[1],ymm6[1],ymm0[2],ymm6[2]
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0]
+; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1,2],ymm0[3]
+; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm1
+; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm9
+; AVX1-ONLY-NEXT: vbroadcastsd 136(%rsi), %ymm11
+; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm11[2],ymm9[3]
+; AVX1-ONLY-NEXT: vmovapd 128(%r8), %ymm11
+; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload
+; AVX1-ONLY-NEXT: # ymm0 = mem[0],ymm11[1],mem[2,3]
+; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm11[3]
+; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm11[2],ymm6[3]
+; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm11[0],ymm9[1,2,3]
+; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm3
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm6
+; AVX1-ONLY-NEXT: vbroadcastsd 168(%rsi), %ymm7
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7]
+; AVX1-ONLY-NEXT: vmovaps 160(%r8), %ymm7
+; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload
+; AVX1-ONLY-NEXT: # ymm0 = mem[0,1],ymm7[2,3],mem[4,5,6,7]
+; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload
+; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3],ymm7[4,5],mem[6,7]
+; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload
+; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm7[6,7]
+; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm6[2,3,4,5,6,7]
+; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vmovapd 208(%rdx), %xmm6
+; AVX1-ONLY-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm0[1],ymm6[1],ymm0[2],ymm6[2]
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0]
+; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1,2],ymm12[3]
+; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm2
+; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm12, %ymm9
+; AVX1-ONLY-NEXT: vbroadcastsd 200(%rsi), %ymm11
+; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm11[2],ymm9[3]
+; AVX1-ONLY-NEXT: vmovapd 192(%r8), %ymm12
+; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload
+; AVX1-ONLY-NEXT: # ymm0 = mem[0],ymm12[1],mem[2,3]
+; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm12[3]
+; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm12[2],ymm6[3]
+; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm12[0],ymm9[1,2,3]
+; AVX1-ONLY-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm0
+; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm10, %ymm6
+; AVX1-ONLY-NEXT: vbroadcastsd 232(%rsi), %ymm7
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7]
+; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm10
+; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm6 # 32-byte Folded Reload
+; AVX1-ONLY-NEXT: # ymm6 = mem[0,1],ymm10[2,3],mem[4,5,6,7]
+; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm6 # 32-byte Folded Reload
+; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,2,3],ymm10[4,5],mem[6,7]
+; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm6 # 32-byte Folded Reload
+; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,2,3,4,5],ymm10[6,7]
+; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1],ymm7[2,3,4,5,6,7]
+; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm10
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm15
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0]
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0]
-; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm11
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0]
+; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm14
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0]
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0]
+; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm12
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0]
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0]
+; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm11
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0]
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm2
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0]
-; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm10
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0]
; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm9
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0]
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm7
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0]
-; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm8
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0]
-; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm5
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0]
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0]
-; AVX1-ONLY-NEXT: vmovaps %xmm4, 16(%r9)
-; AVX1-ONLY-NEXT: vmovaps %xmm5, (%r9)
-; AVX1-ONLY-NEXT: vmovaps %xmm3, 976(%r9)
-; AVX1-ONLY-NEXT: vmovaps %xmm8, 960(%r9)
-; AVX1-ONLY-NEXT: vmovaps %xmm7, 1136(%r9)
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm6
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0]
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0]
+; AVX1-ONLY-NEXT: vmovaps %xmm13, 16(%r9)
+; AVX1-ONLY-NEXT: vmovaps %xmm6, (%r9)
+; AVX1-ONLY-NEXT: vmovaps %xmm2, 976(%r9)
+; AVX1-ONLY-NEXT: vmovaps %xmm7, 960(%r9)
+; AVX1-ONLY-NEXT: vmovaps %xmm0, 1136(%r9)
; AVX1-ONLY-NEXT: vmovaps %xmm9, 1120(%r9)
-; AVX1-ONLY-NEXT: vmovaps %xmm6, 816(%r9)
-; AVX1-ONLY-NEXT: vmovaps %xmm10, 800(%r9)
-; AVX1-ONLY-NEXT: vmovaps %xmm13, 496(%r9)
-; AVX1-ONLY-NEXT: vmovaps %xmm2, 480(%r9)
-; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%r9)
-; AVX1-ONLY-NEXT: vmovaps %xmm11, 160(%r9)
-; AVX1-ONLY-NEXT: vmovaps %xmm1, 336(%r9)
+; AVX1-ONLY-NEXT: vmovaps %xmm3, 816(%r9)
+; AVX1-ONLY-NEXT: vmovaps %xmm11, 800(%r9)
+; AVX1-ONLY-NEXT: vmovaps %xmm4, 496(%r9)
+; AVX1-ONLY-NEXT: vmovaps %xmm12, 480(%r9)
+; AVX1-ONLY-NEXT: vmovaps %xmm8, 176(%r9)
+; AVX1-ONLY-NEXT: vmovaps %xmm14, 160(%r9)
+; AVX1-ONLY-NEXT: vmovaps %xmm5, 336(%r9)
; AVX1-ONLY-NEXT: vmovaps %xmm15, 320(%r9)
-; AVX1-ONLY-NEXT: vmovaps %xmm12, 656(%r9)
-; AVX1-ONLY-NEXT: vmovaps %xmm14, 640(%r9)
+; AVX1-ONLY-NEXT: vmovaps %xmm1, 656(%r9)
+; AVX1-ONLY-NEXT: vmovaps %xmm10, 640(%r9)
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovaps %ymm0, 1216(%r9)
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -2024,20 +1968,32 @@
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovaps %ymm0, 1152(%r9)
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm0, 1088(%r9)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm0, 1024(%r9)
+; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovaps %ymm0, 992(%r9)
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovaps %ymm0, 896(%r9)
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm0, 864(%r9)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovaps %ymm0, 832(%r9)
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm0, 768(%r9)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovaps %ymm0, 704(%r9)
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovaps %ymm0, 672(%r9)
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovaps %ymm0, 576(%r9)
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm0, 544(%r9)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%r9)
-; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%r9)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%r9)
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%r9)
@@ -2048,40 +2004,28 @@
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%r9)
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r9)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r9)
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9)
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovaps %ymm0, 1248(%r9)
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm0, 1088(%r9)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovaps %ymm0, 1056(%r9)
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm0, 1024(%r9)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovaps %ymm0, 928(%r9)
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm0, 864(%r9)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm0, 768(%r9)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovaps %ymm0, 736(%r9)
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovaps %ymm0, 608(%r9)
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm0, 544(%r9)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%r9)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%r9)
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%r9)
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r9)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r9)
-; AVX1-ONLY-NEXT: addq $1032, %rsp # imm = 0x408
+; AVX1-ONLY-NEXT: addq $936, %rsp # imm = 0x3A8
; AVX1-ONLY-NEXT: vzeroupper
; AVX1-ONLY-NEXT: retq
;
@@ -2090,9 +2034,11 @@
; AVX2-ONLY-NEXT: subq $1032, %rsp # imm = 0x408
; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm4
; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm5
-; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm6
-; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm7
-; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm7
+; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm8
+; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm6
+; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm0
; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm2
@@ -2101,22 +2047,22 @@
; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm1
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5],ymm8[6,7]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],mem[0],ymm8[2],mem[2]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm1
+; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5],ymm7[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],mem[0],ymm7[2],mem[2]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm1
-; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5],ymm6[6,7]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],mem[0],ymm6[2],mem[2]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],ymm6[0],ymm7[2],ymm6[2]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX2-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm1
@@ -2126,19 +2072,21 @@
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],mem[0],ymm5[2],mem[2]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm0
-; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm1
+; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm1
+; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm0
; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5],ymm4[6,7]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5],ymm4[6,7]
+; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm1
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],mem[0],ymm4[2],mem[2]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm2
; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -2151,180 +2099,110 @@
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm9
+; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm12
; AVX2-ONLY-NEXT: vbroadcastsd 168(%rsi), %ymm0
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5],ymm9[6,7]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5],ymm12[6,7]
; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %xmm1
; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm1
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],mem[0],ymm9[2],mem[2]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm12[0],ymm1[0],ymm12[2],ymm1[2]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm5
+; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm11
; AVX2-ONLY-NEXT: vbroadcastsd 200(%rsi), %ymm0
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5],ymm5[6,7]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7]
; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %xmm1
; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm5[0],mem[0],ymm5[2],mem[2]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm3
-; AVX2-ONLY-NEXT: vbroadcastsd 232(%rsi), %ymm4
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7]
-; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %xmm0
-; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm6
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm6[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0]
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm3[0],mem[0],ymm3[2],mem[2]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vbroadcastsd 56(%rsi), %ymm4
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2,3]
-; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm6
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7]
+; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
-; AVX2-ONLY-NEXT: vbroadcastsd 120(%rsi), %ymm4
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2,3]
-; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm6
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7]
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm11[0],mem[0],ymm11[2],mem[2]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vbroadcastsd 184(%rsi), %ymm4
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2,3]
-; AVX2-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm6
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7]
+; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm9
+; AVX2-ONLY-NEXT: vbroadcastsd 232(%rsi), %ymm0
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5],ymm9[6,7]
+; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %xmm1
+; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vbroadcastsd 248(%rsi), %ymm6
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],mem[2,3]
-; AVX2-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm13
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm13[4,5],ymm6[6,7]
+; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm1
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm1[0],ymm9[2],ymm1[2]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm13
-; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm14
-; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm13[1],ymm14[1],ymm13[3],ymm14[3]
-; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,2,3,3]
-; AVX2-ONLY-NEXT: vbroadcastsd 24(%rsi), %ymm7
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm15[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm15
-; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm0
-; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm15[1],ymm0[1],ymm15[3],ymm0[3]
-; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,3,3]
-; AVX2-ONLY-NEXT: vbroadcastsd 88(%rsi), %ymm12
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm12
-; AVX2-ONLY-NEXT: vmovaps 128(%rcx), %ymm1
-; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm12[1],ymm1[1],ymm12[3],ymm1[3]
+; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm1
+; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm0
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,3,3]
+; AVX2-ONLY-NEXT: vbroadcastsd 24(%rsi), %ymm3
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1],ymm2[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm3
+; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm2
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,3,3]
-; AVX2-ONLY-NEXT: vbroadcastsd 152(%rsi), %ymm6
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm6
-; AVX2-ONLY-NEXT: vmovaps 192(%rcx), %ymm2
-; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm6[1],ymm2[1],ymm6[3],ymm2[3]
-; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,3,3]
-; AVX2-ONLY-NEXT: vbroadcastsd 216(%rsi), %ymm10
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm13[0],ymm14[0],ymm13[2],ymm14[2]
-; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm11 = ymm11[2,3],mem[2,3]
-; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm13
-; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm14 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm14 = ymm13[0,1],mem[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm14 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm14 = mem[0,1],ymm13[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm13[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm13[4,5],ymm11[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = mem[2,3],ymm7[2,3]
-; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm11
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm11[2,3],ymm7[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm11
-; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm13 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm13 = ymm11[0,1],mem[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm13 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm13 = mem[0,1],ymm11[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps $63, (%rsp), %ymm11, %ymm13 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm13 = mem[0,1,2,3,4,5],ymm11[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5],ymm7[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm7, (%rsp) # 32-byte Spill
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[2],ymm0[2]
+; AVX2-ONLY-NEXT: vbroadcastsd 88(%rsi), %ymm5
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1],ymm4[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm5
+; AVX2-ONLY-NEXT: vmovaps 128(%rcx), %ymm4
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm5[1],ymm4[1],ymm5[3],ymm4[3]
+; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3]
+; AVX2-ONLY-NEXT: vbroadcastsd 152(%rsi), %ymm7
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm7[0,1],ymm6[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm7
+; AVX2-ONLY-NEXT: vmovaps 192(%rcx), %ymm6
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm7[1],ymm6[1],ymm7[3],ymm6[3]
+; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,2,3,3]
+; AVX2-ONLY-NEXT: vbroadcastsd 216(%rsi), %ymm14
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0,1],ymm15[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-ONLY-NEXT: # ymm0 = ymm0[2,3],mem[2,3]
-; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm7
-; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm11 = ymm7[0,1],mem[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm11 = mem[0,1],ymm7[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm7[6,7]
+; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm1
+; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm14 = ymm1[0,1],mem[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm14 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm7
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm7
-; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm8 = ymm7[0,1],mem[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm8 = mem[0,1],ymm7[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm8 = mem[0,1,2,3,4,5],ymm7[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm0
+; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm1 = ymm0[2,3],mem[2,3]
+; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm8
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm8[2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm0[1],ymm8[3],ymm0[3]
+; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm8
+; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm8[2,3]
+; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm8
+; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm14 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm14 = ymm8[0,1],mem[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm14 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm14 = mem[0,1],ymm8[2,3],mem[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm1[0],ymm12[2],ymm1[2]
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-ONLY-NEXT: # ymm0 = ymm0[2,3],mem[2,3]
-; AVX2-ONLY-NEXT: vmovaps 128(%r8), %ymm1
-; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm7 = ymm1[0,1],mem[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm7 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm9[2,3]
-; AVX2-ONLY-NEXT: vbroadcastsd 176(%rcx), %ymm1
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovaps 160(%r8), %ymm1
-; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm4 = ymm1[0,1],mem[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm4 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm2[0],ymm6[2],ymm2[2]
-; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm5[2,3]
-; AVX2-ONLY-NEXT: vmovaps 192(%r8), %ymm1
+; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm1
; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
; AVX2-ONLY-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -2335,69 +2213,141 @@
; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm3[2,3]
-; AVX2-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm1
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm1
+; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm0
+; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm1 = ymm0[2,3],mem[2,3]
+; AVX2-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm2
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
+; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm2
+; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm2
+; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm3 = ymm2[0,1],mem[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm3 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
+; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm0 = ymm0[2,3],mem[2,3]
+; AVX2-ONLY-NEXT: vmovaps 128(%r8), %ymm1
; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
; AVX2-ONLY-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm0
+; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %ymm0
+; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm12[2,3]
+; AVX2-ONLY-NEXT: vbroadcastsd 176(%rcx), %ymm2
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
+; AVX2-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm2
+; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT: vmovaps 160(%r8), %ymm2
+; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm3 = ymm2[0,1],mem[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm3 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm6[0],ymm7[2],ymm6[2]
+; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm11[2,3]
+; AVX2-ONLY-NEXT: vmovaps 192(%r8), %ymm2
+; AVX2-ONLY-NEXT: vblendps $252, (%rsp), %ymm2, %ymm1 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm1 = ymm2[0,1],mem[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm1 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %ymm0
+; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm9[2,3]
+; AVX2-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm4
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm4
+; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3],ymm4[2,3]
+; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm7
+; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm0 = ymm7[0,1],mem[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm0 = mem[0,1],ymm7[2,3],mem[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm7[4,5],ymm2[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm7[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm6
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0]
+; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm0[0],mem[0]
+; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm9
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0]
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm10
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0]
-; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm15
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0]
-; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm14
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0]
-; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm13
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm0[0],mem[0]
+; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm13
; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0]
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm12
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm0[0],mem[0]
+; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm8
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0]
+; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm0[0],mem[0]
+; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm12
; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0]
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm0[0],mem[0]
+; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm10
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm11
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0]
+; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm5
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0]
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm9
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0]
-; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0]
-; AVX2-ONLY-NEXT: vmovaps %xmm8, 16(%r9)
-; AVX2-ONLY-NEXT: vmovaps %xmm9, (%r9)
+; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm15
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0]
+; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0]
+; AVX2-ONLY-NEXT: vmovaps %xmm4, 16(%r9)
+; AVX2-ONLY-NEXT: vmovaps %xmm15, (%r9)
; AVX2-ONLY-NEXT: vmovaps %xmm0, 976(%r9)
-; AVX2-ONLY-NEXT: vmovaps %xmm11, 960(%r9)
+; AVX2-ONLY-NEXT: vmovaps %xmm5, 960(%r9)
; AVX2-ONLY-NEXT: vmovaps %xmm1, 1136(%r9)
-; AVX2-ONLY-NEXT: vmovaps %xmm12, 1120(%r9)
+; AVX2-ONLY-NEXT: vmovaps %xmm10, 1120(%r9)
; AVX2-ONLY-NEXT: vmovaps %xmm2, 816(%r9)
-; AVX2-ONLY-NEXT: vmovaps %xmm13, 800(%r9)
+; AVX2-ONLY-NEXT: vmovaps %xmm12, 800(%r9)
; AVX2-ONLY-NEXT: vmovaps %xmm3, 496(%r9)
-; AVX2-ONLY-NEXT: vmovaps %xmm14, 480(%r9)
-; AVX2-ONLY-NEXT: vmovaps %xmm4, 176(%r9)
-; AVX2-ONLY-NEXT: vmovaps %xmm15, 160(%r9)
-; AVX2-ONLY-NEXT: vmovaps %xmm5, 336(%r9)
-; AVX2-ONLY-NEXT: vmovaps %xmm10, 320(%r9)
-; AVX2-ONLY-NEXT: vmovaps %xmm6, 656(%r9)
-; AVX2-ONLY-NEXT: vmovaps %xmm7, 640(%r9)
+; AVX2-ONLY-NEXT: vmovaps %xmm8, 480(%r9)
+; AVX2-ONLY-NEXT: vmovaps %xmm14, 176(%r9)
+; AVX2-ONLY-NEXT: vmovaps %xmm13, 160(%r9)
+; AVX2-ONLY-NEXT: vmovaps %xmm11, 336(%r9)
+; AVX2-ONLY-NEXT: vmovaps %xmm9, 320(%r9)
+; AVX2-ONLY-NEXT: vmovaps %xmm7, 656(%r9)
+; AVX2-ONLY-NEXT: vmovaps %xmm6, 640(%r9)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 1248(%r9)
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-ONLY-NEXT: vmovaps %ymm0, 1216(%r9)
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -2408,9 +2358,11 @@
; AVX2-ONLY-NEXT: vmovaps %ymm0, 1056(%r9)
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-ONLY-NEXT: vmovaps %ymm0, 1024(%r9)
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-ONLY-NEXT: vmovaps %ymm0, 992(%r9)
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 928(%r9)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-ONLY-NEXT: vmovaps %ymm0, 896(%r9)
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-ONLY-NEXT: vmovaps %ymm0, 864(%r9)
@@ -2423,6 +2375,8 @@
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-ONLY-NEXT: vmovaps %ymm0, 672(%r9)
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 608(%r9)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-ONLY-NEXT: vmovaps %ymm0, 576(%r9)
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-ONLY-NEXT: vmovaps %ymm0, 544(%r9)
@@ -2434,7 +2388,9 @@
; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%r9)
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-ONLY-NEXT: vmovaps %ymm0, 352(%r9)
-; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%r9)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%r9)
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%r9)
@@ -2454,14 +2410,6 @@
; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%r9)
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%r9)
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vmovaps %ymm0, 1248(%r9)
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vmovaps %ymm0, 928(%r9)
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vmovaps %ymm0, 608(%r9)
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%r9)
; AVX2-ONLY-NEXT: addq $1032, %rsp # imm = 0x408
; AVX2-ONLY-NEXT: vzeroupper
; AVX2-ONLY-NEXT: retq
@@ -3695,664 +3643,579 @@
;
; AVX1-ONLY-LABEL: store_i64_stride5_vf64:
; AVX1-ONLY: # %bb.0:
-; AVX1-ONLY-NEXT: subq $2280, %rsp # imm = 0x8E8
-; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm4
-; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm5
-; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm6
-; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm1
-; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm0
-; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm2
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm7
-; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm7[1],ymm2[3],ymm7[3]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
-; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm3
-; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
-; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],mem[0],ymm6[2],mem[2]
-; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm3
-; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm2
-; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vmovaps 144(%rdx), %xmm2
-; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],mem[0],ymm5[2],mem[2]
-; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm2
-; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm1
-; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],mem[0],ymm4[2],mem[2]
-; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm1
-; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm0
-; AVX1-ONLY-NEXT: vmovaps 208(%rdx), %xmm1
-; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovaps 256(%rcx), %ymm0
-; AVX1-ONLY-NEXT: vmovaps 272(%rdx), %xmm1
-; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm1
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],mem[0],ymm1[2],mem[2]
-; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm10
-; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovaps 288(%rcx), %xmm1
-; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm1
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],mem[0],ymm1[2],mem[2]
-; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm8
-; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovaps 320(%rcx), %xmm1
-; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovaps 320(%rcx), %ymm0
-; AVX1-ONLY-NEXT: vmovaps 336(%rdx), %xmm1
-; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm1
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],mem[0],ymm1[2],mem[2]
-; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm11
-; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovaps 384(%rcx), %xmm1
-; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovaps 384(%rcx), %ymm0
-; AVX1-ONLY-NEXT: vmovaps 400(%rdx), %xmm1
-; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovaps 448(%rcx), %ymm0
-; AVX1-ONLY-NEXT: vmovaps 464(%rdx), %xmm1
-; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %ymm1
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],mem[0],ymm1[2],mem[2]
-; AVX1-ONLY-NEXT: vmovapd %ymm1, %ymm15
-; AVX1-ONLY-NEXT: vmovapd 480(%rcx), %xmm1
-; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; AVX1-ONLY-NEXT: subq $2248, %rsp # imm = 0x8C8
+; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm12
+; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm7
+; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm6
+; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm9
+; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm11
+; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm8
+; AVX1-ONLY-NEXT: vmovapd 128(%rsi), %ymm0
+; AVX1-ONLY-NEXT: vmovapd 64(%rsi), %ymm1
+; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm2
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm8[0],ymm2[0],ymm8[2],ymm2[2]
+; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm5[1,2,3]
+; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm3 = mem[0,0,3,2]
+; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm3[2,3]
+; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm11[0],mem[0],ymm11[2],mem[2]
+; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3]
+; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm9[0],ymm1[0],ymm9[2],ymm1[2]
+; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3]
+; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm2 = mem[0,0,3,2]
+; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
+; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm6[0],mem[0],ymm6[2],mem[2]
+; AVX1-ONLY-NEXT: vmovapd %ymm6, %ymm9
+; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3]
+; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm7[0],ymm0[0],ymm7[2],ymm0[2]
+; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3]
+; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = mem[0,0,3,2]
+; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm9
-; AVX1-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm0
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5],ymm9[6,7]
-; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm1
-; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],mem[0],ymm9[2],mem[2]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm7
-; AVX1-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm0
-; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2],ymm7[3]
-; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm1
-; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill
-; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm12[0],mem[0],ymm12[2],mem[2]
+; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],mem[0],ymm7[2],mem[2]
-; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm2
+; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vmovapd 192(%rsi), %ymm0
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
+; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3]
+; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = mem[0,0,3,2]
+; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm6
-; AVX1-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm0
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5],ymm6[6,7]
-; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm1
-; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],mem[0],ymm6[2],mem[2]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm0
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7]
-; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm1
-; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm5
-; AVX1-ONLY-NEXT: vbroadcastsd 136(%rsi), %ymm0
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5],ymm5[6,7]
-; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm1
-; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],mem[0],ymm5[2],mem[2]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vbroadcastsd 168(%rsi), %ymm0
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7]
-; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm1
-; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vbroadcastsd 200(%rsi), %ymm0
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5],ymm4[6,7]
-; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm1
-; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm4
-; AVX1-ONLY-NEXT: vbroadcastsd 232(%rsi), %ymm0
-; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2],ymm4[3]
-; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm1
-; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
+; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm14
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],mem[0],ymm14[2],mem[2]
+; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],mem[0],ymm4[2],mem[2]
-; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %ymm2
+; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vmovapd 256(%rsi), %ymm0
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
+; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3]
+; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = mem[0,0,3,2]
+; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX1-ONLY-NEXT: vmovupd %ymm0,
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vbroadcastsd 264(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps 256(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],mem[0],ymm3[2],mem[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastsd 296(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vmovaps 288(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastsd 328(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovaps 320(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vbroadcastsd 360(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3] -; AVX1-ONLY-NEXT: vmovaps 352(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd %ymm2, %ymm7 +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastsd 392(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vmovaps 384(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vbroadcastsd 424(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3] -; AVX1-ONLY-NEXT: vmovaps 416(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendpd 
{{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3] +; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 320(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],mem[0],ymm13[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 456(%rsi), %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 448(%rdx), %xmm10 -; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm8[0,1,2,3,4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 488(%rsi), %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm15[0,1],ymm8[2],ymm15[3] -; AVX1-ONLY-NEXT: vmovapd %ymm15, %ymm11 -; AVX1-ONLY-NEXT: vmovaps 480(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1,2],ymm15[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm15 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = mem[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm15[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = mem[0,1],ymm15[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm15[4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = mem[2,3],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm9[0],ymm7[0],ymm9[2],ymm7[3] -; AVX1-ONLY-NEXT: vmovapd 48(%rsi), %xmm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm15[1],xmm9[1] -; AVX1-ONLY-NEXT: 
vbroadcastsd 56(%rcx), %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm15[2],ymm9[3] -; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm15 -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm15[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = mem[0],ymm15[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm15[2],ymm7[3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm9[0,1,2],ymm15[3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm7 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = ymm7[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[0,1],ymm7[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = mem[2,3],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[2],ymm6[3] -; AVX1-ONLY-NEXT: vmovapd 112(%rsi), %xmm9 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm9[1],xmm7[1] -; AVX1-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm9[2],ymm7[3] -; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm9 -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = mem[0],ymm9[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm9[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm9[2],ymm6[3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1,2],ymm9[3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovaps 128(%r8), %ymm6 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 
32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm6[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[0,1],ymm6[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = mem[2,3],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovapd 176(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[2],ymm5[3] -; AVX1-ONLY-NEXT: vmovapd 176(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3] -; AVX1-ONLY-NEXT: vmovapd 160(%r8), %ymm7 -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[0],ymm7[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = ymm7[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm7[2],ymm5[3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1,2],ymm7[3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm6 -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[0,1],ymm6[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm6[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = mem[2,3],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovapd 240(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[2],ymm4[3] -; AVX1-ONLY-NEXT: vmovapd 240(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovupd %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 384(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX1-ONLY-NEXT: vmovapd %ymm2, %ymm10 +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 448(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],mem[0],ymm4[2],mem[2] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm11[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovlpd {{.*#+}} xmm0 = mem[0],xmm0[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm2[0],ymm1[0],ymm2[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm12[2,3] +; AVX1-ONLY-NEXT: vmovapd 176(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovlpd {{.*#+}} xmm2 = mem[0],xmm3[1] +; AVX1-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm2[0,1],ymm3[2],ymm2[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm14[2,3] +; AVX1-ONLY-NEXT: vmovapd 240(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm6[0],ymm2[0],ymm6[2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovlpd {{.*#+}} xmm2 = mem[0],xmm6[1] ; AVX1-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm6 -; AVX1-ONLY-NEXT: vblendpd 
{{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3] -; AVX1-ONLY-NEXT: vmovapd 224(%r8), %ymm6 -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm6[0],mem[1,2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = mem[2,3],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovapd 304(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[2],ymm6[3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovlpd {{.*#+}} xmm6 = mem[0],xmm7[1] +; AVX1-ONLY-NEXT: vbroadcastsd 312(%rcx), %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = mem[2,3],ymm13[2,3] +; AVX1-ONLY-NEXT: vmovapd 368(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[2],ymm6[3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovlpd {{.*#+}} xmm6 = mem[0],xmm7[1] +; AVX1-ONLY-NEXT: vbroadcastsd 376(%rcx), %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = mem[2,3],ymm10[2,3] +; AVX1-ONLY-NEXT: vmovapd 432(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[2],ymm6[3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovlpd {{.*#+}} xmm6 = mem[0],xmm7[1] +; AVX1-ONLY-NEXT: vbroadcastsd 440(%rcx), %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = mem[2,3],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovapd 496(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm7[0],ymm6[0],ymm7[2],ymm6[3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovlpd {{.*#+}} xmm6 = mem[0],xmm7[1] +; AVX1-ONLY-NEXT: vbroadcastsd 504(%rcx), %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm6[0,1],ymm7[2],ymm6[3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm4[1],ymm6[1],ymm4[2],ymm6[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm4 +; AVX1-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm8[2],ymm4[3] +; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm8 +; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = mem[0],ymm8[1],mem[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3] ; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2],ymm6[3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = 
ymm8[0],ymm4[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm11, %ymm4 +; AVX1-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3] +; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm6 ; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm7 = mem[0],ymm6[1],mem[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps 256(%r8), %ymm4 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm4[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[0,1],ymm4[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovapd 304(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[3] -; AVX1-ONLY-NEXT: vmovapd 304(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vbroadcastsd 312(%rcx), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3] -; AVX1-ONLY-NEXT: vmovapd 288(%r8), %ymm5 -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = mem[0],ymm5[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm5[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2],ymm3[3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1,2],ymm5[3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = 
ymm3[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 320(%r8), %ymm4 -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[0,1],ymm4[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm4[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd 368(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[2],ymm2[3] -; AVX1-ONLY-NEXT: vmovapd 368(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vbroadcastsd 376(%rcx), %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3] -; AVX1-ONLY-NEXT: vmovapd 352(%r8), %ymm4 -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm4[0],mem[1,2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3] ; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0],ymm4[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 80(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1],ymm0[1],ymm4[2],ymm0[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3] +; AVX1-ONLY-NEXT: vmovapd 64(%r8), %ymm6 +; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = mem[0],ymm6[1],mem[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0],ymm5[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 96(%r8), 
%ymm4 ; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm5 = mem[0],ymm4[1],mem[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm15[0,1],ymm4[2],ymm15[3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 144(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm0[1],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vbroadcastsd 136(%rsi), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3] +; AVX1-ONLY-NEXT: vmovapd 128(%r8), %ymm5 +; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0],ymm5[1],mem[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0],ymm4[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd 168(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 160(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = mem[0],ymm1[1],mem[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd $11, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = mem[0,1],ymm1[2],mem[3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 208(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm0[1],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3] +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vbroadcastsd 
200(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3] +; AVX1-ONLY-NEXT: vmovapd 192(%r8), %ymm4 +; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[0],ymm4[1],mem[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0],ymm3[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd 232(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 224(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0],ymm1[1],mem[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd $11, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0,1],ymm1[2],mem[3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 272(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm0[1],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3] +; AVX1-ONLY-NEXT: vmovaps 256(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vbroadcastsd 264(%rsi), %ymm4 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 384(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm3[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: 
vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovapd 256(%r8), %ymm4 +; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0],ymm4[1],mem[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0],ymm2[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 288(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd 296(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 288(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2,3],ymm1[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovapd 432(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd 432(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 440(%rcx), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3] -; AVX1-ONLY-NEXT: vmovapd 416(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm3[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[0],ymm3[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm3[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 448(%r8), %ymm1 ; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm13[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1],ymm1[2,3],ymm12[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm11[2,3] -; AVX1-ONLY-NEXT: vmovapd 496(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 496(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 504(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd 480(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = mem[0],ymm2[1],mem[2,3] +; AVX1-ONLY-NEXT: vmovapd 336(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm0[1],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovaps 320(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vbroadcastsd 328(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd 320(%r8), %ymm4 +; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0],ymm4[1],mem[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm2[0],ymm10[1,2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0],ymm2[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd 360(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 352(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2,3],ymm1[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 400(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm0[1],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: 
vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovaps 384(%rdx), %xmm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vbroadcastsd 392(%rsi), %ymm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm14[2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd 384(%r8), %ymm14 +; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0],ymm14[1],mem[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm14[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0],ymm2[1,2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm8[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps 416(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd 424(%rsi), 
%ymm1
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX1-ONLY-NEXT: vmovaps 416(%r8), %ymm1
+; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
+; AVX1-ONLY-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
+; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
+; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2,3],ymm1[4,5],mem[6,7]
+; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
+; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7]
+; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vmovapd 464(%rdx), %xmm0
+; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm1[1],ymm0[1],ymm1[2],ymm0[2]
 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm2
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],mem[0]
-; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm15
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm15[0],mem[0]
-; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm14
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm14[0],mem[0]
-; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm13
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm13[0],mem[0]
-; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm12
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm12[0],mem[0]
-; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm11
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0]
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm0[0],mem[0]
-; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm10
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0]
-; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm9
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0]
-; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm8
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0]
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm0[0],mem[0]
-; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm7
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0]
-; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm6
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0]
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0]
-; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm15
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0]
-; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm5
+; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm12[3]
+; AVX1-ONLY-NEXT: vmovaps 448(%rdx), %xmm2
+; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm12, %ymm0
+; AVX1-ONLY-NEXT: vbroadcastsd 456(%rsi), %ymm12
+; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2],ymm0[3]
+; AVX1-ONLY-NEXT: vmovapd 448(%r8), %ymm12
+; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm11 # 32-byte Folded Reload
+; AVX1-ONLY-NEXT: # ymm11 = mem[0],ymm12[1],mem[2,3]
+; AVX1-ONLY-NEXT: vmovupd %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm14[0,1,2],ymm12[3]
+; AVX1-ONLY-NEXT: vmovupd %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm12[2],ymm1[3]
+; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm12[0],ymm0[1,2,3]
+; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vmovaps 480(%rdx), %xmm0
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
+; AVX1-ONLY-NEXT: vbroadcastsd 488(%rsi), %ymm12
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5],ymm1[6,7]
+; AVX1-ONLY-NEXT: vmovaps 480(%r8), %ymm12
+; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm11 # 32-byte Folded Reload
+; AVX1-ONLY-NEXT: # ymm11 = mem[0,1],ymm12[2,3],mem[4,5,6,7]
+; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm11 # 32-byte Folded Reload
+; AVX1-ONLY-NEXT: # ymm11 = mem[0,1,2,3],ymm12[4,5],mem[6,7]
+; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm14 # 32-byte Folded Reload
+; AVX1-ONLY-NEXT: # ymm14 = mem[0,1,2,3,4,5],ymm12[6,7]
+; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm1[2,3,4,5,6,7]
+; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm1
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm5[0],mem[0]
+; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm15[0],mem[0]
+; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm10[0],mem[0]
+; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill
+; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm9[0],mem[0]
+; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm8[0],mem[0]
+; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm5
 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0]
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm14
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0]
+; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm15
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0]
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0]
+; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm14
 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0]
-; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm13
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0]
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0]
+; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm12
 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0]
-; AVX1-ONLY-NEXT: vmovaps %xmm12, 16(%r9)
-; AVX1-ONLY-NEXT: vmovaps %xmm13, (%r9)
-; AVX1-ONLY-NEXT: vmovaps %xmm4, 1936(%r9)
-; AVX1-ONLY-NEXT: vmovaps %xmm14, 1920(%r9)
-; AVX1-ONLY-NEXT: vmovaps %xmm0, 2256(%r9)
-; AVX1-ONLY-NEXT: vmovaps %xmm5, 2240(%r9)
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0]
+; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm11
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0]
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm10
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm9
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0]
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0]
+; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm8
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0]
+; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%r9)
+; AVX1-ONLY-NEXT: vmovaps %xmm8, (%r9)
+; AVX1-ONLY-NEXT: vmovaps %xmm13, 1936(%r9)
+; AVX1-ONLY-NEXT: vmovaps %xmm9, 1920(%r9)
+; AVX1-ONLY-NEXT: vmovaps %xmm2, 2256(%r9)
+; AVX1-ONLY-NEXT: vmovaps %xmm10, 2240(%r9)
 ; AVX1-ONLY-NEXT: vmovaps %xmm0, 2416(%r9)
-; AVX1-ONLY-NEXT: vmovaps %xmm15, 2400(%r9)
-; AVX1-ONLY-NEXT: vmovaps %xmm1, 2096(%r9)
-; AVX1-ONLY-NEXT: vmovaps %xmm6, 2080(%r9)
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %xmm0, 1616(%r9)
-; AVX1-ONLY-NEXT: vmovaps %xmm7, 1600(%r9)
-; AVX1-ONLY-NEXT: vmovaps %xmm2, 1776(%r9)
-; AVX1-ONLY-NEXT: vmovaps %xmm8, 1760(%r9)
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %xmm0, 1456(%r9)
-; AVX1-ONLY-NEXT: vmovaps %xmm9, 1440(%r9)
+; AVX1-ONLY-NEXT: vmovaps %xmm11, 2400(%r9)
+; AVX1-ONLY-NEXT: vmovaps %xmm3, 2096(%r9)
+; AVX1-ONLY-NEXT: vmovaps %xmm12, 2080(%r9)
+; AVX1-ONLY-NEXT: vmovaps %xmm6, 1616(%r9)
+; AVX1-ONLY-NEXT: vmovaps %xmm14, 1600(%r9)
+; AVX1-ONLY-NEXT: vmovaps %xmm4, 1776(%r9)
+; AVX1-ONLY-NEXT: vmovaps %xmm15, 1760(%r9)
+; AVX1-ONLY-NEXT: vmovaps %xmm7, 1456(%r9)
+; AVX1-ONLY-NEXT: vmovaps %xmm5, 1440(%r9)
 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %xmm0, 976(%r9)
-; AVX1-ONLY-NEXT: vmovaps %xmm10, 960(%r9)
-; AVX1-ONLY-NEXT: vmovaps %xmm3, 1136(%r9)
-; AVX1-ONLY-NEXT: vmovaps %xmm11, 1120(%r9)
+; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %xmm0, 960(%r9)
+; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %xmm0, 1136(%r9)
+; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %xmm0, 1120(%r9)
 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %xmm0, 816(%r9)
 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %xmm0, 800(%r9)
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %xmm0, 496(%r9)
 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %xmm0, 480(%r9)
-; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%r9)
 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%r9)
@@ -4371,8 +4234,12 @@
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2496(%r9)
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm0, 2464(%r9)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2432(%r9)
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm0, 2368(%r9)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2304(%r9)
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2272(%r9)
@@ -4383,6 +4250,10 @@
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2112(%r9)
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm0, 2048(%r9)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm0, 1984(%r9)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1952(%r9)
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1856(%r9)
@@ -4391,12 +4262,20 @@
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1792(%r9)
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm0, 1728(%r9)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm0, 1664(%r9)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1632(%r9)
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1536(%r9)
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm0, 1504(%r9)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1472(%r9)
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm0, 1408(%r9)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1344(%r9)
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1312(%r9)
@@ -4407,20 +4286,32 @@
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1152(%r9)
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm0, 1088(%r9)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm0, 1024(%r9)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 992(%r9)
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 896(%r9)
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm0, 864(%r9)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 832(%r9)
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm0, 768(%r9)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 704(%r9)
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 672(%r9)
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 576(%r9)
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm0, 544(%r9)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%r9)
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%r9)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%r9)
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%r9)
@@ -4431,82 +4322,56 @@
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%r9)
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r9)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r9)
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9)
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2528(%r9)
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm0, 2464(%r9)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm0, 2368(%r9)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2336(%r9)
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2208(%r9)
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm0, 2048(%r9)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2016(%r9)
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm0, 1984(%r9)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1888(%r9)
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm0, 1728(%r9)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1696(%r9)
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm0, 1664(%r9)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1568(%r9)
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm0, 1504(%r9)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm0, 1408(%r9)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1376(%r9)
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1248(%r9)
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm0, 1088(%r9)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1056(%r9)
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm0, 1024(%r9)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 928(%r9)
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm0, 864(%r9)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm0, 768(%r9)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 736(%r9)
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 608(%r9)
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm0, 544(%r9)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%r9)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%r9)
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%r9)
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r9)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r9)
-; AVX1-ONLY-NEXT: addq $2280, %rsp # imm = 0x8E8
+; AVX1-ONLY-NEXT: addq $2248, %rsp # imm = 0x8C8
 ; AVX1-ONLY-NEXT: vzeroupper
 ; AVX1-ONLY-NEXT: retq
 ;
 ; AVX2-ONLY-LABEL: store_i64_stride5_vf64:
 ; AVX2-ONLY: # %bb.0:
-; AVX2-ONLY-NEXT: subq $2440, %rsp # imm = 0x988
+; AVX2-ONLY-NEXT: subq $2392, %rsp # imm = 0x958
 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm4
 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm5
 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm6
-; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm7
+; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm8
+; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm7
+; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm0
 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm2
@@ -4515,13 +4380,13 @@
 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm1
-; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5],ymm7[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5],ymm8[6,7]
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],mem[0],ymm7[2],mem[2]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],mem[0],ymm8[2],mem[2]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX2-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm1
@@ -4530,8 +4395,8 @@
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],mem[0],ymm6[2],mem[2]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm7[0],ymm6[2],ymm7[2]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
 ; AVX2-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm1
@@ -4541,19 +4406,21 @@
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],mem[0],ymm5[2],mem[2]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm0
-; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm1
+; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm1
+; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm0
 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5],ymm4[6,7]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5],ymm4[6,7]
+; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm1
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],mem[0],ymm4[2],mem[2]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm2
 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -4566,7 +4433,7 @@
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm2
 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -4577,9 +4444,11 @@
 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm1
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm2
 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -4592,7 +4461,7 @@
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm2
 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -4603,9 +4472,11 @@
 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm1
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm2
 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -4618,7 +4489,7 @@
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm2
 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -4629,9 +4500,11 @@
 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 288(%rsi), %ymm1
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm2
 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -4644,7 +4517,7 @@
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm2
 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -4655,9 +4528,11 @@
 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 352(%rsi), %ymm1
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm2
 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -4670,7 +4545,7 @@
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm2
 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -4681,9 +4556,11 @@
 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 416(%rsi), %ymm1
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm2
 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -4696,7 +4573,7 @@
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm2
 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -4707,54 +4584,15 @@
 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 480(%rsi), %ymm1
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vbroadcastsd 56(%rsi), %ymm0
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
-; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm1
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vbroadcastsd 120(%rsi), %ymm0
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
-; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm1
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vbroadcastsd 184(%rsi), %ymm0
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
-; AVX2-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm1
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vbroadcastsd 248(%rsi), %ymm0
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
-; AVX2-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm1
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vbroadcastsd 312(%rsi), %ymm0
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
-; AVX2-ONLY-NEXT: vbroadcastsd 312(%rcx), %ymm1
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vbroadcastsd 376(%rsi), %ymm0
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
-; AVX2-ONLY-NEXT: vbroadcastsd 376(%rcx), %ymm1
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vbroadcastsd 440(%rsi), %ymm0
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
-; AVX2-ONLY-NEXT: vbroadcastsd 440(%rcx), %ymm1
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vbroadcastsd 504(%rsi), %ymm0
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
-; AVX2-ONLY-NEXT: vbroadcastsd 504(%rcx), %ymm1
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm0
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm10
 ; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm1
-; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm10[1],ymm1[1],ymm10[3],ymm1[3]
 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,3,3]
 ; AVX2-ONLY-NEXT: vbroadcastsd 24(%rsi), %ymm3
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm2[2,3,4,5,6,7]
@@ -4782,35 +4620,35 @@
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vmovaps 256(%rdx), %ymm9
 ; AVX2-ONLY-NEXT: vmovaps 256(%rcx), %ymm8
-; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
-; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,3,3]
-; AVX2-ONLY-NEXT: vbroadcastsd 280(%rsi), %ymm11
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm10[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
+; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,3,3]
+; AVX2-ONLY-NEXT: vbroadcastsd 280(%rsi), %ymm12
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm11[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 320(%rdx), %ymm11
-; AVX2-ONLY-NEXT: vmovaps 320(%rcx), %ymm10
-; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm11[1],ymm10[1],ymm11[3],ymm10[3]
-; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,2,3,3]
-; AVX2-ONLY-NEXT: vbroadcastsd 344(%rsi), %ymm13
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm12[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 384(%rdx), %ymm13
-; AVX2-ONLY-NEXT: vmovaps 384(%rcx), %ymm12
-; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm13[1],ymm12[1],ymm13[3],ymm12[3]
-; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,2,3,3]
-; AVX2-ONLY-NEXT: vbroadcastsd 408(%rsi), %ymm15
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm14[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovaps 320(%rdx), %ymm12
+; AVX2-ONLY-NEXT: vmovaps 320(%rcx), %ymm11
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
+; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,2,3,3]
+; AVX2-ONLY-NEXT: vbroadcastsd 344(%rsi), %ymm14
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 384(%rdx), %ymm0
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 384(%rcx), %ymm13
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm0[1],ymm13[1],ymm0[3],ymm13[3]
+; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,2,3,3]
+; AVX2-ONLY-NEXT: vbroadcastsd 408(%rsi), %ymm14
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm15[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vmovaps 448(%rdx), %ymm14
 ; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vmovaps 448(%rcx), %ymm0
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm14[1],ymm0[1],ymm14[3],ymm0[3]
-; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,2,3,3]
-; AVX2-ONLY-NEXT: vbroadcastsd 472(%rsi), %ymm14
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0,1],ymm15[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm14[1],ymm0[1],ymm14[3],ymm0[3]
+; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,2,3,3]
+; AVX2-ONLY-NEXT: vbroadcastsd 472(%rsi), %ymm15
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm14[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm1[0],ymm10[2],ymm1[2]
 ; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX2-ONLY-NEXT: # ymm0 = ymm0[2,3],mem[2,3]
 ; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm1
@@ -4820,26 +4658,30 @@
 ; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
 ; AVX2-ONLY-NEXT: # ymm14 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm14 = mem[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm10 = mem[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm1
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm1
-; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm14 = ymm1[0,1],mem[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm0
+; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm1 = ymm0[2,3],mem[2,3]
+; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm10
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm10[2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm0[1],ymm10[3],ymm0[3]
+; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm10
+; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm10[2,3]
+; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm10
+; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm14 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm14 = ymm10[0,1],mem[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm14 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
+; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm14 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm14 = mem[0,1],ymm10[2,3],mem[4,5,6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm14 = mem[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
 ; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
@@ -4856,21 +4698,25 @@
 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm1
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm1
-; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm0
+; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm1 = ymm0[2,3],mem[2,3]
+; AVX2-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm2
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
+; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm2
+; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm2
+; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm3 = ymm2[0,1],mem[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm3 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
 ; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
@@ -4887,21 +4733,25 @@
 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT: vbroadcastsd 176(%rcx), %ymm1
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovaps 160(%r8), %ymm1
-; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %ymm0
+; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm1 = ymm0[2,3],mem[2,3]
+; AVX2-ONLY-NEXT: vbroadcastsd 176(%rcx), %ymm2
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
+; AVX2-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm2
+; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT: vmovaps 160(%r8), %ymm2
+; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm3 = ymm2[0,1],mem[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm3 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm6[0],ymm7[2],ymm6[2]
 ; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
@@ -4918,21 +4768,25 @@
 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm1
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm1
-; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %ymm0
+; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm1 = ymm0[2,3],mem[2,3]
+; AVX2-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm2
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
+; AVX2-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm2
+; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm2
+; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm3 = ymm2[0,1],mem[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm3 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
 ; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
@@ -4949,23 +4803,27 @@
 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT: vbroadcastsd 304(%rcx), %ymm1
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovaps 288(%r8), %ymm1
-; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT: vmovaps 288(%rdx), %ymm0
+; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm1 = ymm0[2,3],mem[2,3]
+; AVX2-ONLY-NEXT: vbroadcastsd 304(%rcx), %ymm2
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
+; AVX2-ONLY-NEXT: vbroadcastsd 312(%rcx), %ymm2
+; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT: vmovaps 288(%r8), %ymm2
+; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm3 = ymm2[0,1],mem[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm3 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm10[0],ymm11[2],ymm10[2]
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
 ; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX2-ONLY-NEXT: # ymm0 = ymm0[2,3],mem[2,3]
 ; AVX2-ONLY-NEXT: vmovaps 320(%r8), %ymm1
@@ -4975,28 +4833,33 @@
 ; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
 ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps $63, (%rsp), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT: vbroadcastsd 368(%rcx), %ymm1
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovaps 352(%r8), %ymm1
-; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
 ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm12[0],ymm13[2],ymm12[2]
+; AVX2-ONLY-NEXT: vmovaps 352(%rdx), %ymm0
+; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm1 = ymm0[2,3],mem[2,3]
+; AVX2-ONLY-NEXT: vbroadcastsd 368(%rcx), %ymm2
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
+; AVX2-ONLY-NEXT: vbroadcastsd 376(%rcx), %ymm2
+; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT: vmovaps 352(%r8), %ymm2
+; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm3 = ymm2[0,1],mem[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm3 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm13[0],ymm0[2],ymm13[2]
 ; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX2-ONLY-NEXT: # ymm0 = ymm0[2,3],mem[2,3]
 ; AVX2-ONLY-NEXT: vmovaps 384(%r8), %ymm1
@@ -5011,21 +4874,25 @@
 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT: vbroadcastsd 432(%rcx), %ymm1
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovaps 416(%r8), %ymm1
-; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT: vmovaps 416(%rdx), %ymm0
+; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm1 = ymm0[2,3],mem[2,3]
+; AVX2-ONLY-NEXT: vbroadcastsd 432(%rcx), %ymm2
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
+; AVX2-ONLY-NEXT: vbroadcastsd 440(%rcx), %ymm2
+; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT: vmovaps 416(%r8), %ymm2
+; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm3 = ymm2[0,1],mem[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm3 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
@@ -5043,21 +4910,25 @@
 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT: vbroadcastsd 496(%rcx), %ymm1
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovaps 480(%r8), %ymm1
-; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT: vmovaps 480(%rdx), %ymm0
+; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm1 = ymm0[2,3],mem[2,3]
+; AVX2-ONLY-NEXT: vbroadcastsd 496(%rcx), %ymm2
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
+; AVX2-ONLY-NEXT: vbroadcastsd 504(%rcx), %ymm2
+; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT: vmovaps 480(%r8), %ymm2
+; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm3 = ymm2[0,1],mem[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm3 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm0
 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
@@ -5123,12 +4994,12 @@
 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0]
 ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm12
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0]
+; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm11
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0]
 ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm11
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0]
+; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm12
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0]
 ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0]
 ; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm10
@@ -5144,9 +5015,9 @@
 ; AVX2-ONLY-NEXT: vmovaps %xmm0, 1936(%r9)
 ; AVX2-ONLY-NEXT: vmovaps %xmm10, 1920(%r9)
 ; AVX2-ONLY-NEXT: vmovaps %xmm1, 2256(%r9)
-; AVX2-ONLY-NEXT: vmovaps %xmm11, 2240(%r9)
+; AVX2-ONLY-NEXT: vmovaps %xmm12, 2240(%r9)
 ; AVX2-ONLY-NEXT: vmovaps %xmm2, 2416(%r9)
-; AVX2-ONLY-NEXT: vmovaps %xmm12, 2400(%r9)
+; AVX2-ONLY-NEXT: vmovaps %xmm11, 2400(%r9)
 ; AVX2-ONLY-NEXT: vmovaps %xmm3, 2096(%r9)
 ; AVX2-ONLY-NEXT: vmovaps %xmm13, 2080(%r9)
 ; AVX2-ONLY-NEXT: vmovaps %xmm4, 1616(%r9)
@@ -5188,6 +5059,8 @@
 ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX2-ONLY-NEXT: vmovaps %xmm0, 1280(%r9)
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 2528(%r9)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2496(%r9)
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2464(%r9)
@@ -5200,6 +5073,8 @@
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2272(%r9)
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 2208(%r9)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2176(%r9)
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2144(%r9)
@@ -5212,6 +5087,8 @@
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1952(%r9)
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 1888(%r9)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1856(%r9)
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1824(%r9)
@@ -5224,6 +5101,8 @@
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1632(%r9)
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 1568(%r9)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1536(%r9)
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1504(%r9)
@@ -5236,6 +5115,8 @@
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1312(%r9)
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 1248(%r9)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1216(%r9)
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1184(%r9)
@@ -5248,6 +5129,8 @@
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 992(%r9)
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 928(%r9)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 896(%r9)
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:
vmovaps %ymm0, 864(%r9) @@ -5260,6 +5143,8 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 672(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 608(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 576(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 544(%r9) @@ -5272,6 +5157,8 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 352(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%r9) @@ -5299,23 +5186,7 @@ ; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 2528(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 2208(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1888(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1568(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1248(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 928(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 608(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%r9) -; AVX2-ONLY-NEXT: addq $2440, %rsp # imm = 0x988 +; AVX2-ONLY-NEXT: addq $2392, %rsp # imm = 0x958 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll @@ -197,51 +197,52 @@ ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm4 -; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 16(%r10), %xmm0 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm4 +; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm5 +; AVX1-ONLY-NEXT: vmovapd (%r10), %ymm0 ; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm7 -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm8 +; AVX1-ONLY-NEXT: vmovapd 
(%r10), %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm1 +; AVX1-ONLY-NEXT: vmovapd (%r9), %xmm8 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm8[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3],ymm7[4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovapd 16(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3],ymm1[4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm9 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm10 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],mem[0],ymm4[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm11 -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm12 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm12[0],xmm11[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm8[1],xmm5[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm11[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 24(%r9), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm10[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm9, %ymm6 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm11 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = xmm11[2,3,2,3] +; AVX1-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0,1],ymm6[2,3],ymm12[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm12 = ymm4[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm0[2,3],ymm12[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm5[0],mem[0],ymm5[2],mem[2] +; AVX1-ONLY-NEXT: vmovapd 16(%rcx), %xmm5 +; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],xmm5[0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 24(%r9), %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm7[2],ymm5[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm11[0] ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm9[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm5, 16(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm4, 
128(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm3, 96(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm10, 160(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm0, 192(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -531,216 +532,215 @@ ; ; AVX1-ONLY-LABEL: store_i64_stride7_vf8: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: pushq %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm8 -; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm4 -; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm0 -; AVX1-ONLY-NEXT: vmovapd 32(%rax), %xmm9 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm2 +; AVX1-ONLY-NEXT: vmovapd (%rax), %ymm4 +; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm6 +; AVX1-ONLY-NEXT: vmovapd 16(%r8), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm3 +; AVX1-ONLY-NEXT: vmovapd (%rax), %xmm13 +; AVX1-ONLY-NEXT: vmovapd 32(%rax), %xmm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vmovapd (%r9), %xmm14 +; AVX1-ONLY-NEXT: vmovapd 32(%r9), %xmm12 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm12[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm12 -; AVX1-ONLY-NEXT: vmovaps 16(%r8), %xmm11 -; AVX1-ONLY-NEXT: vmovapd 32(%r8), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm13 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm13[1],xmm10[1] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm5[2,3,2,3] ; AVX1-ONLY-NEXT: vbroadcastsd 40(%rcx), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm15[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm3[2,3],ymm8[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vmovapd 32(%r9), %xmm14 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm14[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm13[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm7, %ymm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm7[0],ymm13[1],ymm7[2],ymm13[2] -; AVX1-ONLY-NEXT: vmovupd %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm10 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm10[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm8[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm13 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm4[1],ymm11[1],ymm4[3],ymm11[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm15[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm14[1],xmm9[1] -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm14 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm14[0],ymm9[2],ymm14[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm0[0,1],ymm9[2,3] -; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm14 = ymm14[0,0,3,2] -; AVX1-ONLY-NEXT: vmovapd 32(%rax), %ymm1 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm1[2,3],ymm14[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm14[0],ymm0[1],ymm14[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm14 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm14[0],mem[0],ymm14[2],mem[2] -; AVX1-ONLY-NEXT: vmovapd 48(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm14[2,3] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm14 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm14[0],ymm3[0],ymm14[2],ymm3[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm15[1],xmm5[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm5[0],xmm13[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%r9), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm7[0],xmm5[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm5, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm7 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm14[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm15, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm15[2,3],ymm7[4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm2[0],ymm9[1],ymm2[3],ymm9[2] +; 
AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm9[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = mem[2,3],ymm9[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm9[1],ymm15[1],ymm9[2],ymm15[3] +; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm15 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm14[1],xmm13[1] +; AVX1-ONLY-NEXT: vmovapd 32(%rax), %ymm14 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm10[0],ymm1[0],ymm10[2],ymm1[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm13[0,1],ymm10[2,3] +; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm0 = ymm15[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm0[0],ymm13[1],ymm0[2],ymm13[3] +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[2],ymm15[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm12[1],xmm11[1] +; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],mem[0],ymm12[2],mem[2] +; AVX1-ONLY-NEXT: vmovapd 48(%rcx), %xmm12 +; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],xmm12[0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = mem[0,0] +; AVX1-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd (%rdx), %ymm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm15, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovapd 16(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm15[2,3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2],ymm6[3] +; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm12[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%r9), %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm10[0] -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm5, 16(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 352(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 384(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm9, 320(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm0, 416(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1,2],ymm14[3] +; AVX1-ONLY-NEXT: vunpcklpd 
{{.*#+}} xmm6 = xmm8[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm8 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps %xmm8, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm6, (%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 128(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm1, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm0, 352(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm11, 320(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm13, 384(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm10, 96(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm9, 192(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm4, 416(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm5, 224(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) -; AVX1-ONLY-NEXT: popq %rax +; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i64_stride7_vf8: ; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: pushq %rax ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm12 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm7 -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm5 -; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm2 -; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm4 -; AVX2-ONLY-NEXT: vmovaps 16(%rax), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm13 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 32(%r9), %xmm13 +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm1 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm13[0] +; AVX2-ONLY-NEXT: vmovaps 16(%rax), %xmm5 +; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm14 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm3, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm15 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm9 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm9[1] -; AVX2-ONLY-NEXT: vbroadcastsd 40(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],mem[0],ymm5[2],mem[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm12[0],mem[0],ymm12[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm14[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] -; AVX2-ONLY-NEXT: 
vmovddup {{.*#+}} xmm8 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],xmm4[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm15, %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm12 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm12[1],xmm11[1] +; AVX2-ONLY-NEXT: vbroadcastsd 40(%rcx), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm15[4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovaps 48(%rax), %xmm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm15 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm11[1],xmm13[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] -; AVX2-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm7[0,1,2,3],ymm13[4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],mem[0],ymm7[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm13[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 56(%r9), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm14[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm13, %ymm9 -; AVX2-ONLY-NEXT: vbroadcastsd 32(%rcx), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm13 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm15[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm14 -; AVX2-ONLY-NEXT: vbroadcastsd %xmm4, %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm2 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm7[1],ymm9[1],ymm7[3],ymm9[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} 
ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm3[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm15[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm15 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] +; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm10 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm10[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] +; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %ymm9 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm13[1],xmm14[1] +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],mem[0],ymm13[2],mem[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = mem[0,0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm14 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm2 +; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm12[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vbroadcastsd 32(%rcx), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm14[0],xmm15[0] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm11 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm5, %ymm5 +; AVX2-ONLY-NEXT: vbroadcastsd %xmm10, %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps %xmm2, 16(%rdi) -; AVX2-ONLY-NEXT: vmovaps %xmm13, (%rdi) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 128(%rdi) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 64(%rdi) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 320(%rdi) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdi) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 384(%rdi) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 256(%rdi) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rdi) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rdi) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 352(%rdi) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 224(%rdi) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 416(%rdi) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 288(%rdi) +; 
AVX2-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm9[1],ymm13[1],ymm9[3],ymm13[3] +; AVX2-ONLY-NEXT: vbroadcastsd 56(%r9), %ymm9 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm9[2,3] +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm11[0],mem[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps %xmm9, 16(%rdi) +; AVX2-ONLY-NEXT: vmovaps %xmm10, (%rdi) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdi) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rdi) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 352(%rdi) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 320(%rdi) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 416(%rdi) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 96(%rdi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rdi) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rdi) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%rdi) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%rdi) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 224(%rdi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%rdi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%rdi) +; AVX2-ONLY-NEXT: popq %rax ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -2004,236 +2004,226 @@ ; ; AVX1-ONLY-LABEL: store_i64_stride7_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $552, %rsp # imm = 0x228 +; AVX1-ONLY-NEXT: subq $408, %rsp # imm = 0x198 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm9 -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm10 -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 16(%r8), %xmm1 -; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm5 -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm7 +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd (%rax), %ymm6 +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm15 +; AVX1-ONLY-NEXT: vmovapd 16(%r8), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm0 +; AVX1-ONLY-NEXT: vmovapd (%rax), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm8 ; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastsd 8(%rcx), 
%ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm14 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm14[2,3,2,3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm10[1],ymm1[1],ymm10[3],ymm1[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm0[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm2 +; AVX1-ONLY-NEXT: vmovapd (%r9), %xmm13 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm13[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm13 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm13[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[3],ymm1[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0,1,2],ymm6[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm4[0],xmm1[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[2] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 32(%r9), %xmm7 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm7[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} 
ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 32(%r9), %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vbroadcastsd 40(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovapd 32(%rax), %ymm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm11 -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm0[1],ymm11[3],ymm0[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 48(%rax), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %ymm10 -; AVX1-ONLY-NEXT: vmovaps 48(%r8), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm0[1],ymm10[3],ymm0[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm8 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm8[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm9 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm9[0,1,0,1] +; AVX1-ONLY-NEXT: vmovapd 48(%r8), %xmm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[0],ymm0[1],ymm6[3],ymm0[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm11[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 64(%r9), %xmm10 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm10[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm2 +; AVX1-ONLY-NEXT: vmovapd 64(%rax), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 64(%r9), %ymm3 +; AVX1-ONLY-NEXT: vmovapd 80(%r8), %xmm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovapd 64(%rax), %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm0[0,1,2],ymm12[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[2],ymm8[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = 
xmm8[0],xmm1[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rcx), %ymm9, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm9[0],ymm2[1],ymm9[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vbroadcastsd 104(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 72(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm12 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm12[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 80(%rax), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 80(%r8), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm0[1],ymm5[3],ymm0[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 80(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm15[0],xmm2[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rcx), %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm3 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm3[0],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rax), %ymm15, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 104(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 112(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = 
ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],mem[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 112(%rcx), %xmm9 +; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3] ; AVX1-ONLY-NEXT: vbroadcastsd 112(%r9), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm1[0,1,2],ymm2[3] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm14[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm11[0],ymm2[2],ymm11[2] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm13[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm3 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm10[0],ymm3[2],ymm10[2] -; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm10 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm10[0],xmm8[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm6[0],ymm3[2],ymm6[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm9[1],xmm7[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm6 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm6[0],xmm4[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovapd 96(%rsi), %ymm6 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0],ymm5[1,2,3] -; AVX1-ONLY-NEXT: vmovapd 112(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm6 = ymm6[0,0,3,2] -; AVX1-ONLY-NEXT: vmovapd 96(%rax), %ymm9 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm9[2,3],ymm6[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 120(%r9), %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2],ymm0[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0],ymm9[1],ymm5[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm12[0] -; 
AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm9[0],mem[0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd (%rdx), %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm2, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm14[1] +; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm13 +; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm13[0],ymm15[0],ymm13[2],ymm15[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm13 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 16(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm2 = ymm15[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[2,3],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm2[0],ymm0[1],ymm2[2],ymm0[3] +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm7[1],mem[1] +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[2],ymm7[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm0[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2] +; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm2[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm0 = ymm7[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm0[0],ymm2[1],ymm0[2],ymm2[3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vbroadcastsd 72(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 64(%rdx), %ymm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm11, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm10[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovapd 64(%rsi), %ymm5 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX1-ONLY-NEXT: vblendpd 
{{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovapd 64(%r8), %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX1-ONLY-NEXT: vmovapd 80(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm11[2,3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0],ymm4[1],ymm5[2],ymm4[3] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rax), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovapd 96(%rsi), %ymm8 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm8[0],ymm4[2],ymm8[2] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm10[0],ymm4[1,2,3] +; AVX1-ONLY-NEXT: vmovapd 112(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm8 = ymm8[0,0,3,2] +; AVX1-ONLY-NEXT: vmovapd 96(%rax), %ymm11 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm11[2,3],ymm8[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0],ymm10[1],ymm8[2],ymm10[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 120(%r9), %ymm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1,2],ymm11[3] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm11 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm11, 16(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm9, (%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm10, 464(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm7, 448(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 832(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm5, 768(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 576(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 544(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 352(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 320(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm0, 864(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm15, 800(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm14 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps %xmm14, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm12, (%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm11, 464(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm10, 448(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm8, 832(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm4, 768(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 704(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm5, 608(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm3, 576(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 544(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 
512(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm7, 384(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm6, 352(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm15, 320(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm13, 160(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm9, 864(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 800(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 736(%rax) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 704(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 672(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 640(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 608(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) @@ -2242,244 +2232,251 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: addq $552, %rsp # imm = 0x228 +; AVX1-ONLY-NEXT: addq $408, %rsp # imm = 0x198 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i64_stride7_vf16: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $424, %rsp # imm = 0x1A8 +; AVX2-ONLY-NEXT: subq $520, %rsp # imm = 0x208 ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm7 +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm11 -; 
AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 16(%rax), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm7 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm15[1],ymm3[3],ymm15[3] +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 16(%rax), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm9 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm0[1],ymm13[1],ymm0[3],ymm13[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 32(%r9), %xmm3 ; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm2 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm2[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm4 -; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; AVX2-ONLY-NEXT: vbroadcastsd 40(%rcx), %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 48(%rax), %xmm4 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm2[0],xmm3[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm5, %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm1[1] +; AVX2-ONLY-NEXT: vbroadcastsd 40(%rcx), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 48(%rax), %xmm5 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm8[1],ymm1[3],ymm8[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm13[1],ymm5[3],ymm13[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,2,3,3] -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 80(%rax), %xmm4 
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%r8), %xmm2 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm2[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rax), %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm12 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm8[1],ymm12[1],ymm8[3],ymm12[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovaps 80(%rax), %xmm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 96(%r8), %xmm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm1[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rax), %ymm2, %ymm2 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm5 -; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] -; AVX2-ONLY-NEXT: vbroadcastsd 104(%rcx), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm10 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm6[1],ymm10[1],ymm6[3],ymm10[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,3,3] -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovaps 112(%rax), %xmm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm12 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm15[0],ymm3[2],ymm15[2] -; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm5 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} 
xmm3 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm2[1] +; AVX2-ONLY-NEXT: vbroadcastsd 104(%rcx), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm6 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm10[1],ymm6[1],ymm10[3],ymm6[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],mem[0],ymm9[2],mem[2] -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm4 -; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps 112(%rax), %xmm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm14 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm13[0],ymm0[2],ymm13[2] +; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm13 +; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm5 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm13[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],mem[0],ymm4[2],mem[2] +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm2 +; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3],ymm15[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm9[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm11[0],mem[0],ymm11[2],mem[2] +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm3 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm9[2,3],ymm15[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm8[1],xmm7[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm11[0],mem[0],ymm11[2],mem[2] -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm7 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm9[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 72(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %xmm11 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm13[0],ymm1[2],ymm13[2] -; AVX2-ONLY-NEXT: vmovaps 64(%rax), %xmm9 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm15[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd 72(%rcx), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0,1,2,3],ymm9[4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm15[1],xmm9[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 64(%r9), %ymm1 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm15[2,3],ymm13[2,3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm10[0],ymm6[2],ymm10[2] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1],ymm6[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 120(%r9), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm10[0,1,2,3],ymm13[4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vmovaps 96(%rax), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm6[0,1],ymm0[2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm12[0],mem[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm9[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %xmm9 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm8[0],ymm12[0],ymm8[2],ymm12[2] +; AVX2-ONLY-NEXT: vmovaps 64(%r9), %xmm15 +; AVX2-ONLY-NEXT: vmovaps 64(%rax), %xmm8 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm15[1],xmm8[1] +; AVX2-ONLY-NEXT: 
vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],mem[0],ymm7[2],mem[2] +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 64(%r9), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm7[2,3],ymm12[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm10[0],ymm6[0],ymm10[2],ymm6[2] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1],ymm6[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm14[0],xmm5[0] ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm6 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vbroadcastsd %xmm5, %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm0[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd 32(%rcx), %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm11[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5 -; AVX2-ONLY-NEXT: vbroadcastsd %xmm9, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm5[0,1,2,3],ymm7[4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX2-ONLY-NEXT: vbroadcastsd %xmm13, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 88(%rcx), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd 96(%rcx), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm14[0],mem[0],ymm14[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 112(%r9), %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, (%rsp), %ymm1, %ymm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd 32(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm15[0] +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vbroadcastsd %xmm8, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm0[1],ymm11[3],ymm0[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 88(%rcx), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd 96(%rcx), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %ymm1 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm5 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm5[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 112(%r9), %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm5[1],ymm1[3],ymm5[3] +; AVX2-ONLY-NEXT: vbroadcastsd 120(%r9), %ymm5 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3] +; AVX2-ONLY-NEXT: vmovaps 96(%rax), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1],ymm5[2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm9 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm5 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm11 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps %xmm11, 16(%rax) +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm9 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps %xmm9, 16(%rax) ; AVX2-ONLY-NEXT: vmovaps %xmm6, (%rax) -; AVX2-ONLY-NEXT: vmovaps %xmm9, 464(%rax) -; AVX2-ONLY-NEXT: vmovaps %xmm8, 448(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 800(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 768(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm15, 576(%rax) -; 
AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 544(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 512(%rax) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 352(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 320(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 128(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 832(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 736(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 704(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 672(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 640(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm5, 464(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm2, 448(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 864(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 800(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 768(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 576(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 608(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 480(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 544(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 512(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 832(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 736(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 704(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 672(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm15, 640(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 608(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 480(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm4, 416(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 
192(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 224(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 192(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 32(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 864(%rax) -; AVX2-ONLY-NEXT: addq $424, %rsp # imm = 0x1A8 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-ONLY-NEXT: addq $520, %rsp # imm = 0x208 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -4735,146 +4732,88 @@ ; ; AVX1-ONLY-LABEL: store_i64_stride7_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1320, %rsp # imm = 0x528 +; AVX1-ONLY-NEXT: subq $1384, %rsp # imm = 0x568 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm2 -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm7 -; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm4 -; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 16(%r8), %xmm6 +; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm6 +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd (%rax), %ymm7 +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 16(%r8), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm0 -; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm5 -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm11 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = xmm11[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm12[2,3],ymm5[4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm5[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = xmm12[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm10 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} 
xmm10 = xmm10[0],xmm7[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm9[1],ymm1[3],ymm9[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm7[2,3,2,3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm1[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm2, %ymm4 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm3[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] -; AVX1-ONLY-NEXT: vbroadcastsd 40(%rcx), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm5[1] -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %ymm3 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm0[1],ymm4[3],ymm0[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 48(%rax), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 48(%r8), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; 
AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm0 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vbroadcastsd 72(%rcx), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm11 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm11[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm8 +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm3 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[3],ymm1[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0,1,2],ymm7[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm4[0],xmm1[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[2] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 32(%r9), %ymm3 +; AVX1-ONLY-NEXT: vmovupd %ymm3, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: 
vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vbroadcastsd 40(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovapd 32(%rax), %ymm2 +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX1-ONLY-NEXT: vmovaps 80(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 80(%rax), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 80(%r8), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 48(%r8), %xmm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[3],ymm0[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 64(%r9), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 80(%r8), %xmm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovapd 64(%rax), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = 
xmm1[0],xmm0[0] @@ -4883,628 +4822,771 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[2] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps 96(%rax), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 96(%rax), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; AVX1-ONLY-NEXT: vbroadcastsd 104(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm6[1] -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 96(%r9), %ymm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 112(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 112(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 112(%rax), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 112(%r8), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vbroadcastsd 104(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 96(%r9), %ymm12 +; 
AVX1-ONLY-NEXT: vmovapd 112(%r8), %xmm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm12[0],ymm0[1],ymm12[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovapd 96(%rax), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm0 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 128(%r8), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 128(%rax), %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vbroadcastsd 136(%rcx), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm10[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %ymm3 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%r8), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 128(%r9), %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX1-ONLY-NEXT: vmovaps 144(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 144(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 144(%rax), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 144(%r8), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 128(%r8), %xmm13 +; AVX1-ONLY-NEXT: vmovapd 128(%rax), %xmm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, 
%ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 128(%r9), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 144(%r8), %xmm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovapd 128(%rax), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rcx), %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[2] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 160(%r8), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps 160(%rax), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovapd 160(%r9), %xmm11 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm11[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovapd 160(%rax), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 160(%r8), %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; AVX1-ONLY-NEXT: vbroadcastsd 168(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vbroadcastsd 168(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 160(%r9), %ymm5 +; AVX1-ONLY-NEXT: vmovapd 176(%r8), %xmm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovapd 160(%rax), %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[1],ymm2[1],ymm1[2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 192(%r9), %xmm10 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm10[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: 
vinsertf128 $1, %xmm0, %ymm1, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 192(%r8), %xmm4 +; AVX1-ONLY-NEXT: vmovapd 192(%rax), %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm2[2,3],ymm15[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm15[0],xmm2[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rcx), %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[2] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vbroadcastsd 232(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 240(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 240(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 240(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm6[1] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %ymm2 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd (%rdx), %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm2 +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 16(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm2 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%r8), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 160(%r9), %ymm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 176(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 176(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm0 = ymm2[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vbroadcastsd 72(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 64(%rdx), %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 176(%rax), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovapd 64(%rsi), %ymm3 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = 
ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 176(%r8), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = ymm3[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 96(%rsi), %ymm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] +; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm0 = ymm2[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 112(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vbroadcastsd 136(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 128(%rdx), %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm1, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1] +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovapd 128(%rsi), %ymm3 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%r8), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vmovaps 144(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = 
xmm2[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm0 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 192(%r8), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 192(%rax), %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vbroadcastsd 200(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm8 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm8[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 144(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = ymm3[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 160(%rsi), %ymm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 160(%r8), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; AVX1-ONLY-NEXT: vmovapd 176(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm0 = ymm2[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 176(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vbroadcastsd 200(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 192(%rdx), %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm0 -; 
AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps 208(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 208(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 208(%rax), %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm1[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rcx), %ymm5, %ymm6 -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[2] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vmovapd 224(%r8), %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rax), %ymm2, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vbroadcastsd 232(%rcx), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 224(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0],ymm0[1,2,3] -; AVX1-ONLY-NEXT: vmovapd 240(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovapd 240(%rdx), %xmm14 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm14[0],ymm3[0],ymm14[2],ymm3[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm14[0,1],ymm2[2],ymm14[3] -; AVX1-ONLY-NEXT: vbroadcastsd 240(%r9), %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm14[3] -; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,2] -; AVX1-ONLY-NEXT: vmovapd 224(%rax), %ymm0 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2],ymm14[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 216(%r9), %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm14[4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 248(%r9), %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm14[2],ymm3[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm15[0],ymm0[1],ymm15[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm12[0],xmm10[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm10[0],xmm11[0] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm9[0],xmm8[0] -; AVX1-ONLY-NEXT: vunpcklpd 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm13[0],mem[0] -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm13 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm10 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm10[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovapd 192(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm0[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovapd 192(%r8), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vmovapd 208(%rcx), %xmm9 +; AVX1-ONLY-NEXT: vmovapd 208(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm9[0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 208(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm2 = ymm4[0,0,3,2] +; AVX1-ONLY-NEXT: vmovapd 192(%rax), %ymm4 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm4[2,3],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm2[0],ymm0[1],ymm2[2],ymm0[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rax), %ymm8, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovapd 224(%rsi), %ymm8 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm8[0],ymm2[2],ymm8[2] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0],ymm2[1,2,3] +; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm8 = ymm8[0,0,3,2] +; AVX1-ONLY-NEXT: vmovapd 224(%rax), %ymm15 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm15[2,3],ymm8[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0],ymm2[1],ymm8[2],ymm2[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm9[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 216(%r9), %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm9[2],ymm2[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm2[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 248(%r9), %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm9[2],ymm4[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm10[0],ymm15[1],ymm10[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm4[0,1,2],ymm15[3] +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm10 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm10, 16(%rsi) -; AVX1-ONLY-NEXT: vmovaps %xmm9, (%rsi) -; 
AVX1-ONLY-NEXT: vmovaps %xmm11, 1360(%rsi) -; AVX1-ONLY-NEXT: vmovaps %xmm8, 1344(%rsi) -; AVX1-ONLY-NEXT: vmovaps %xmm12, 464(%rsi) -; AVX1-ONLY-NEXT: vmovaps %xmm15, 448(%rsi) -; AVX1-ONLY-NEXT: vmovaps %xmm13, 912(%rsi) -; AVX1-ONLY-NEXT: vmovaps %xmm3, 896(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm0, 1760(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm1, 1728(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 1696(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm14, 1664(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 1632(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 1600(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 1568(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 1536(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm14 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm13 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm12 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps %xmm12, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm13, 1360(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 1344(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm14, 464(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm2, 448(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm10, 912(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm4, 896(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm8, 1728(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm9, 1664(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 1600(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm7, 1504(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm6, 1472(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm11, 1440(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 1504(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 1408(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 1472(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 1280(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 1440(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 1248(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 1216(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 1056(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 1024(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 992(%rax) ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 1408(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 960(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 1376(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 832(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 1312(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 800(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 1280(%rsi) +; 
AVX1-ONLY-NEXT: vmovaps %ymm0, 768(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 1248(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 608(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 1216(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 576(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 1184(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 544(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 1152(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 1120(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 1088(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 1056(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 1024(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 992(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 960(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 928(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm15, 1760(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 864(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 1696(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 832(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 1632(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 800(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 1568(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm5, 1536(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 768(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 1376(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 736(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 1312(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 704(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 1184(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 672(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 1152(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 640(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 1120(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 608(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 1088(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 576(%rsi) +; AVX1-ONLY-NEXT: vmovaps 
%ymm0, 928(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 544(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 864(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 736(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 704(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 672(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 640(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) -; AVX1-ONLY-NEXT: addq $1320, %rsp # imm = 0x528 +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX1-ONLY-NEXT: addq $1384, %rsp # imm = 0x568 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i64_stride7_vf32: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $1464, %rsp # imm = 0x5B8 +; AVX2-ONLY-NEXT: subq $1768, %rsp # imm = 0x6E8 ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm3 -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm13 -; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm9 -; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] -; AVX2-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1],ymm5[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm6 -; 
AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm4 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm10 -; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 16(%rax), %xmm7 -; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm5 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],xmm10[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm13[0],ymm9[0],ymm13[2],ymm9[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm3[0],mem[0],ymm3[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm8[2,3],ymm6[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm14 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 16(%rax), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm4[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 32(%r9), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm1 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm2[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm0[1] ; AVX2-ONLY-NEXT: vbroadcastsd 40(%rcx), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm2 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 48(%rax), %xmm2 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps 48(%rax), %xmm1 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm4[1],ymm12[3],ymm4[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovaps 80(%rax), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vbroadcastsd 72(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 96(%r9), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 96(%r8), %xmm0 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm1[0] +; AVX2-ONLY-NEXT: vmovaps 96(%rax), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rax), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%r9), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 80(%rax), %xmm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%r8), %xmm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 96(%rax), %xmm2 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; AVX2-ONLY-NEXT: vbroadcastsd 104(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX2-ONLY-NEXT: vbroadcastsd 104(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm2 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = 
ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%r9), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm4 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 112(%rax), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vbroadcastsd 136(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 128(%r8), %xmm14 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rax), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%r8), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 128(%r9), %ymm11 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX2-ONLY-NEXT: vmovaps 
128(%rdx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovaps 144(%rax), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 160(%r9), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 160(%r8), %xmm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 160(%rax), %xmm2 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm1[0] +; AVX2-ONLY-NEXT: vmovaps 160(%rax), %xmm5 +; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %xmm9 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm9[1] -; AVX2-ONLY-NEXT: vbroadcastsd 168(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm1[1] +; AVX2-ONLY-NEXT: vbroadcastsd 168(%rcx), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm2 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 160(%r8), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 160(%r9), %ymm7 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm7 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm7[1],ymm11[3],ymm7[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX2-ONLY-NEXT: vmovaps 160(%rdx), 
%ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 176(%rax), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vbroadcastsd 200(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 192(%r8), %xmm4 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm15 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm15[1],ymm13[3],ymm15[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,2,3,3] +; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovaps 208(%rax), %xmm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rax), %xmm5 +; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm1[1] +; AVX2-ONLY-NEXT: vbroadcastsd 232(%rcx), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm5 +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm1[1],ymm5[1],ymm1[3],ymm5[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,3,3] +; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %ymm8 +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vmovaps 240(%rax), %xmm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = ymm5[0],mem[0],ymm5[2],mem[2] +; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm8 +; AVX2-ONLY-NEXT: 
vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm0[1],xmm8[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm10[0],mem[0],ymm10[2],mem[2] +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm8 +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm8[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = ymm12[0],mem[0],ymm12[2],mem[2] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm8 = xmm8[1],mem[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm14[0],mem[0],ymm14[2],mem[2] +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm8 +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm8[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd 72(%rcx), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = ymm5[0],mem[0],ymm5[2],mem[2] +; AVX2-ONLY-NEXT: vmovaps 64(%r9), %xmm14 +; AVX2-ONLY-NEXT: vmovaps 64(%rax), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm14[1],xmm0[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 64(%r9), %ymm10 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm5[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = 
ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm4 = xmm4[1],mem[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 96(%r9), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd 136(%rcx), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 128(%r8), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX2-ONLY-NEXT: vmovaps 128(%rax), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%r9), %xmm3 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm3[1],xmm0[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX2-ONLY-NEXT: vmovaps 128(%r8), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 128(%r9), %ymm4 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm6[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm11[0],ymm7[0],ymm11[2],ymm7[2] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm6 = xmm6[1],mem[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX2-ONLY-NEXT: vmovaps 160(%r8), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 160(%r9), %ymm11 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm0[0],ymm11[0],ymm0[2],ymm11[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm6[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd 200(%rcx), 
%ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 192(%r8), %xmm6 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm13[0],ymm15[0],ymm13[2],ymm15[2] +; AVX2-ONLY-NEXT: vmovaps 192(%rax), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 192(%r9), %xmm2 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm2[1],xmm8[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 192(%rcx), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps 192(%r8), %ymm13 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm13[0],mem[0],ymm13[2],mem[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm7[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm7 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm7[0,1],ymm1[0,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 224(%rax), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm1[0],mem[0],ymm1[2],mem[2] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%r8), %ymm1 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 208(%rax), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm6 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm6[0,1],ymm0[0,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 224(%rax), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 
224(%rdx), %xmm2 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vbroadcastsd 232(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm15[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps 240(%rax), %xmm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm3 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: vbroadcastsd 32(%rcx), %ymm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] @@ -5518,156 +5600,177 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm0[0],mem[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm0[0],xmm14[0] ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX2-ONLY-NEXT: 
vinsertf128 $1, %xmm0, %ymm15, %ymm15 -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 88(%rcx), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd 96(%rcx), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm14[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm14 ; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 152(%rcx), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 -; AVX2-ONLY-NEXT: vbroadcastsd 160(%rcx), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4 -; AVX2-ONLY-NEXT: vbroadcastsd %xmm5, %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = 
ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 216(%rcx), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 216(%r9), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm10, %ymm2 -; AVX2-ONLY-NEXT: vbroadcastsd 224(%rcx), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm6[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 240(%r9), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 248(%r9), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovaps 224(%rax), %ymm10 -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = mem[0,1],ymm10[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 88(%rcx), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm10 = xmm10[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd 96(%rcx), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm9[0],xmm3[0] +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm3, %ymm3 +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} 
ymm3 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 152(%rcx), %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm3[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd 160(%rcx), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm4[1],ymm11[1],ymm4[3],ymm11[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm14[0],mem[0] +; AVX2-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm2[0] +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vbroadcastsd %xmm8, %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm6 = xmm6[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd 224(%rcx), %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm4[1],ymm13[1],ymm4[3],ymm13[3] +; AVX2-ONLY-NEXT: vbroadcastsd 216(%r9), %ymm11 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vmovaps 224(%rcx), %ymm11 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm4[0],ymm11[0],ymm4[2],ymm11[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm7[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 240(%r9), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm11[1],ymm7[1],ymm11[3],ymm7[3] +; AVX2-ONLY-NEXT: vbroadcastsd 248(%r9), %ymm11 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vmovaps 224(%rax), %ymm11 +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = mem[0,1],ymm11[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm14[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %xmm14 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm15 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX2-ONLY-NEXT: vunpcklpd 
{{.*#+}} xmm8 = xmm8[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %xmm13 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm11 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps %xmm11, 16(%rdi) -; AVX2-ONLY-NEXT: vmovaps %xmm3, (%rdi) -; AVX2-ONLY-NEXT: vmovaps %xmm13, 1360(%rdi) -; AVX2-ONLY-NEXT: vmovaps %xmm8, 1344(%rdi) -; AVX2-ONLY-NEXT: vmovaps %xmm15, 464(%rdi) +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm10 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %xmm12 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm4 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps %xmm4, 16(%rdi) +; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rdi) +; AVX2-ONLY-NEXT: vmovaps %xmm12, 1360(%rdi) +; AVX2-ONLY-NEXT: vmovaps %xmm9, 1344(%rdi) +; AVX2-ONLY-NEXT: vmovaps %xmm10, 464(%rdi) ; AVX2-ONLY-NEXT: vmovaps %xmm0, 448(%rdi) ; AVX2-ONLY-NEXT: vmovaps %xmm14, 912(%rdi) -; AVX2-ONLY-NEXT: vmovaps %xmm10, 896(%rdi) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 1760(%rdi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1728(%rdi) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 1696(%rdi) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 1664(%rdi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1632(%rdi) +; AVX2-ONLY-NEXT: vmovaps %xmm11, 896(%rdi) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 1760(%rdi) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 1696(%rdi) +; AVX2-ONLY-NEXT: vmovaps %ymm15, 1664(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1600(%rdi) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 1568(%rdi) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 1536(%rdi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1504(%rdi) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 1536(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1472(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1440(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1408(%rdi) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 1376(%rdi) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 1312(%rdi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1280(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1248(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1216(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1184(%rdi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1152(%rdi) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 1120(%rdi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1088(%rdi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: 
vmovaps %ymm0, 1056(%rdi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1024(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 992(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 960(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 800(%rdi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 768(%rdi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 576(%rdi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 544(%rdi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 512(%rdi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 352(%rdi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%rdi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rdi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rdi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rdi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 1728(%rdi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 1632(%rdi) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 1568(%rdi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 1504(%rdi) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 1376(%rdi) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 1312(%rdi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 1280(%rdi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 1184(%rdi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 1152(%rdi) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 1120(%rdi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 1088(%rdi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 1056(%rdi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 928(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 864(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 832(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 800(%rdi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 768(%rdi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 736(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 
704(%rdi) @@ -5678,22 +5781,12 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 608(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 576(%rdi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 544(%rdi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 512(%rdi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 416(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 352(%rdi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%rdi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%rdi) @@ -5704,14 +5797,8 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rdi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rdi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rdi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdi) -; AVX2-ONLY-NEXT: addq $1464, %rsp # imm = 0x5B8 +; AVX2-ONLY-NEXT: addq $1768, %rsp # imm = 0x6E8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -10377,130 +10464,88 @@ ; ; AVX1-ONLY-LABEL: store_i64_stride7_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $3960, %rsp # imm = 0xF78 +; AVX1-ONLY-NEXT: subq $3320, %rsp # imm = 0xCF8 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm7 -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm8 -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm2 -; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm4 +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd (%rax), %ymm5 +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps 
(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm7[1],ymm5[1],ymm7[3],ymm5[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 16(%r8), %xmm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 16(%r8), %xmm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[3],ymm0[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm0[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm1, %ymm3 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[2] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm5 ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm3 ; 
AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm3[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; AVX1-ONLY-NEXT: vbroadcastsd 40(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 48(%rax), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 48(%r8), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 32(%r9), %ymm3 +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vbroadcastsd 40(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovapd 32(%rax), %ymm2 +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 48(%r8), %xmm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[3],ymm0[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 
64(%r8), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 72(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 80(%rax), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 80(%r8), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 80(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 64(%r9), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 80(%r8), %xmm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovapd 64(%rax), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] @@ -10509,80 +10554,55 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[2] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: 
vpermilps {{.*#+}} xmm2 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rax), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovaps 96(%rax), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; AVX1-ONLY-NEXT: vbroadcastsd 104(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 112(%rax), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%r9), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 112(%r8), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 112(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vbroadcastsd 104(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 96(%r9), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 112(%r8), %xmm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovapd 96(%rax), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 128(%r8), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 128(%rax), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 136(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 128(%r8), %xmm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 144(%rax), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%r9), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 144(%r8), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 144(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 128(%rax), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 128(%r9), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 144(%r8), %xmm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovapd 128(%rax), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] @@ -10591,80 +10611,55 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[2] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 160(%r8), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovaps 160(%rax), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 160(%rax), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; AVX1-ONLY-NEXT: vbroadcastsd 168(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 176(%rax), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%r9), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 176(%r8), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 176(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vbroadcastsd 168(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 160(%r9), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 176(%r8), %xmm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovapd 160(%rax), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 192(%r8), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 192(%rax), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 200(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 192(%r8), %xmm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 208(%rax), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%r9), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 208(%r8), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 208(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 192(%rax), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 192(%r9), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 208(%r8), %xmm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovapd 192(%rax), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] @@ -10673,80 +10668,55 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[2] ; AVX1-ONLY-NEXT: vmovupd %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 224(%r9), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 224(%r9), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 224(%rax), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%r8), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovaps 224(%rax), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; AVX1-ONLY-NEXT: vbroadcastsd 232(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 240(%rax), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%r9), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 240(%r8), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 240(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vbroadcastsd 232(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 224(%r9), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 240(%r8), %xmm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovapd 224(%rax), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: 
vmovaps 256(%r9), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 256(%r8), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 256(%rax), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rdx), %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 264(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 256(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rsi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 256(%r8), %xmm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 272(%rax), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%r9), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 272(%r8), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 272(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 256(%rax), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 256(%r9), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 272(%r8), %xmm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovapd 256(%rax), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 288(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] @@ -10755,551 +10725,622 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; 
AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[2] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 288(%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 288(%r9), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 288(%rax), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 288(%r8), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovaps 288(%rax), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; AVX1-ONLY-NEXT: vbroadcastsd 296(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 304(%rax), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%r9), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 304(%r8), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 304(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vbroadcastsd 296(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 288(%r9), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 304(%r8), %xmm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovapd 288(%rax), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; 
AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 320(%r9), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 320(%r8), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 320(%rax), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 320(%r8), %xmm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 320(%rax), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdx), %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 328(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 320(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rsi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 336(%rax), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%r9), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 336(%r8), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 336(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 320(%r9), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 336(%r8), %xmm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovapd 320(%rax), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX1-ONLY-NEXT: vmovaps 352(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rcx), %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 352(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 352(%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%r8), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovaps 352(%rax), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm0[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rcx), %ymm3, %ymm4 +; AVX1-ONLY-NEXT: vmovaps 352(%rdx), %xmm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm3[0],ymm4[1],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 352(%r9), %xmm12 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm12[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vmovapd 352(%rax), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 352(%r8), %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm3, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; AVX1-ONLY-NEXT: vbroadcastsd 360(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vbroadcastsd 360(%rcx), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 368(%rax), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%r9), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 368(%r8), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 368(%rcx), %xmm15 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm15[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
-; AVX1-ONLY-NEXT: vmovaps 384(%r9), %xmm14
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm14[0,1,0,1]
-; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm0
-; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
-; AVX1-ONLY-NEXT: vmovaps 384(%r8), %xmm2
-; AVX1-ONLY-NEXT: vmovaps 384(%rax), %xmm13
-; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm2, %ymm3
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7]
-; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovaps 384(%rdx), %ymm1
-; AVX1-ONLY-NEXT: vbroadcastsd 392(%rcx), %ymm3
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7]
-; AVX1-ONLY-NEXT: vmovaps 384(%rsi), %xmm0
-; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[2,3,2,3]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5,6,7]
-; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovaps 384(%rsi), %ymm10
-; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm2
-; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm10[1],ymm2[1],ymm10[3],ymm2[3]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX1-ONLY-NEXT: vmovaps 400(%rax), %xmm2
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3,4,5,6,7]
-; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovaps 384(%r9), %ymm8
-; AVX1-ONLY-NEXT: vmovaps 400(%r8), %xmm1
-; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm8[1],ymm1[1],ymm8[3],ymm1[3]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
-; AVX1-ONLY-NEXT: vmovaps 400(%rcx), %xmm7
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm7[2,3,2,3]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3,4,5,6,7]
+; AVX1-ONLY-NEXT: vmovapd 352(%r9), %ymm4
+; AVX1-ONLY-NEXT: vmovapd 368(%r8), %xmm0
+; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[3],ymm0[2]
+; AVX1-ONLY-NEXT: vmovapd 352(%rax), %ymm11
+; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm0[0,1,2],ymm11[3]
+; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3]
+; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[2],ymm2[3]
+; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vmovapd 384(%r9), %xmm13
+; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm13[0,1,0,1]
+; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm1
+; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-ONLY-NEXT: vmovapd 384(%rax), %xmm8
+; AVX1-ONLY-NEXT: vmovaps 384(%r8), %xmm5
+; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm2
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vmovapd 384(%r9), %ymm10
+; AVX1-ONLY-NEXT: vmovapd 400(%r8), %xmm0
+; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm10[0],ymm0[1],ymm10[3],ymm0[2]
+; AVX1-ONLY-NEXT: vmovapd 384(%rax), %ymm1
+; AVX1-ONLY-NEXT: vmovupd %ymm1, (%rsp) # 32-byte Spill
+; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm0[0,1,2],ymm1[3]
+; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3]
+; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[2],ymm2[3]
+; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 416(%rsi), %xmm1
; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm2
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm1[0]
-; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rcx), %ymm3, %ymm4
-; AVX1-ONLY-NEXT: vmovaps 416(%rdx), %xmm6
-; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3
-; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm3[0],ymm4[1],ymm3[2],ymm4[2]
+; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rcx), %ymm3, %ymm7
+; AVX1-ONLY-NEXT: vmovaps 416(%rdx), %xmm15
+; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm3, %ymm3
+; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm3[0],ymm7[1],ymm3[2],ymm7[2]
; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
-; AVX1-ONLY-NEXT: vmovapd 416(%r9), %xmm5
-; AVX1-ONLY-NEXT: vmovapd 416(%r8), %xmm9
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm9[0],xmm5[0]
+; AVX1-ONLY-NEXT: vmovapd 416(%r9), %xmm3
+; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm3[0,1,0,1]
+; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm7
; AVX1-ONLY-NEXT: vmovapd 416(%rax), %xmm2
-; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm11, %ymm11
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm4[6,7]
-; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1]
-; AVX1-ONLY-NEXT: vbroadcastsd 424(%rcx), %ymm4
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7]
-; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm4
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm4[6,7]
-; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovaps 448(%rsi), %xmm0
-; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX1-ONLY-NEXT: vbroadcastsd 456(%rcx), %ymm4
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3,4,5,6,7]
-; AVX1-ONLY-NEXT: vmovapd 448(%rdx), %ymm6
-; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%r8), %ymm6, %ymm4
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm4[2,3],ymm1[4,5],ymm4[6,7]
-; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovaps 464(%rcx), %xmm0
-; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovaps 464(%rdx), %xmm1
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX1-ONLY-NEXT: vmovaps 448(%r8), %ymm3
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7]
-; AVX1-ONLY-NEXT: vbroadcastsd 464(%r9), %ymm9
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm9[6,7]
-; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovaps 480(%rsi), %xmm9
-; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm12
-; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm12[0],xmm9[0]
-; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rcx), %ymm11, %ymm1
-; AVX1-ONLY-NEXT: vmovaps 480(%rdx), %xmm0
-; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm11
-; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2],ymm1[2]
-; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm9[2,3,2,3]
-; AVX1-ONLY-NEXT: vbroadcastsd 488(%rcx), %ymm9
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 =
ymm1[0,1,2,3],ymm9[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%r8), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 416(%r8), %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0,1],ymm7[2,3],ymm14[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vbroadcastsd 424(%rcx), %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 496(%rcx), %xmm9 -; AVX1-ONLY-NEXT: vmovapd 496(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] -; AVX1-ONLY-NEXT: vmovapd 480(%r8), %ymm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm11[2],ymm0[3] -; AVX1-ONLY-NEXT: vbroadcastsd 496(%r9), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovaps 464(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 464(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 448(%r8), %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 464(%r9), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 480(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rcx), %ymm1, %ymm7 +; AVX1-ONLY-NEXT: vmovaps 480(%rdx), %xmm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vbroadcastsd 488(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%r8), %ymm14, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 496(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 496(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 480(%r8), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 496(%r9), %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd (%rdx), %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, 
%ymm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm14[2,3],ymm1[4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm14 +; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm14[0],ymm0[0],ymm14[2],ymm0[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm14[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm1 +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm14 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 16(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[2],ymm7[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; 
AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm0 = ymm7[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vbroadcastsd 72(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 64(%rdx), %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3],ymm0[4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovapd 64(%rsi), %ymm14 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm14[0],ymm7[2],ymm14[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded 
Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = ymm14[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 96(%rsi), %ymm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[2],ymm7[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX1-ONLY-NEXT: vmovaps 112(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm0 = ymm7[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 112(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vbroadcastsd 136(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 128(%rdx), %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3],ymm0[4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] +; 
AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovapd 128(%rsi), %ymm14 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm14[0],ymm7[2],ymm14[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%r8), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps 144(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 144(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 144(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = ymm14[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 160(%rsi), %ymm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[2],ymm7[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%r8), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX1-ONLY-NEXT: vmovaps 176(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm0 = ymm7[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 176(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vbroadcastsd 200(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 192(%rdx), %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3],ymm0[4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovapd 192(%rsi), %ymm14 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm14[0],ymm7[2],ymm14[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps 208(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 208(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 208(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = ymm14[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 224(%rsi), %ymm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = 
ymm1[0],ymm7[0],ymm1[2],ymm7[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX1-ONLY-NEXT: vmovaps 240(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm0 = ymm7[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vbroadcastsd 264(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 256(%rdx), %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3],ymm0[4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovapd 256(%rsi), %ymm14 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm14[0],ymm7[2],ymm14[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 256(%r8), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps 272(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: 
# xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 272(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 272(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = ymm14[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 288(%rsi), %ymm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[2],ymm7[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 288(%r8), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX1-ONLY-NEXT: vmovaps 304(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm0 = ymm7[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 304(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vbroadcastsd 328(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 320(%rdx), %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3],ymm0[4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vmovapd 
320(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovapd 320(%rsi), %ymm14 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm14[0],ymm7[2],ymm14[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 320(%r8), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps 336(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 336(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 336(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = ymm14[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm12[1],xmm9[1] +; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 352(%rsi), %ymm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[2],ymm7[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 352(%r8), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovapd 368(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm0 = ymm7[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 368(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vbroadcastsd 392(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 384(%rdx), %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%r8), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps 368(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm15[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm8[1] +; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovapd 384(%rsi), %ymm5 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 384(%r8), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm14[1],xmm13[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%r8), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] -; AVX1-ONLY-NEXT: vmovaps 400(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm7[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovapd 400(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 400(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = ymm5[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 $19, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 416(%rsi), %ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] +; AVX1-ONLY-NEXT: vmovapd 416(%rsi), %ymm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 416(%r8), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vmovapd 432(%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovapd 432(%rcx), %xmm8 ; AVX1-ONLY-NEXT: vmovapd 432(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm8[0] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 432(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[0,0,3,2] -; AVX1-ONLY-NEXT: vmovapd 416(%rax), %ymm2 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3],ymm5[2,3] -; AVX1-ONLY-NEXT: 
vblendpd {{.*#+}} ymm0 = ymm5[0],ymm1[1],ymm5[2],ymm1[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = ymm2[0,0,3,2] +; AVX1-ONLY-NEXT: vmovapd 416(%rax), %ymm9 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rax), %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rax), %ymm15, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 448(%rsi), %ymm3 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm10[0],ymm1[1,2,3] -; AVX1-ONLY-NEXT: vmovapd 464(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm10[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,3,2] -; AVX1-ONLY-NEXT: vmovapd 448(%rax), %ymm0 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3],ymm3[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0],ymm6[1],ymm3[2],ymm6[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vbroadcastsd 456(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 448(%rdx), %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%r8), %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovapd 448(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3] +; AVX1-ONLY-NEXT: vmovapd 464(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm3 = ymm4[0,0,3,2] +; AVX1-ONLY-NEXT: vmovapd 448(%rax), %ymm4 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rax), %ymm11, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm3[2,3],ymm6[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovapd 480(%rsi), 
%ymm6 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm6[0],ymm3[2],ymm6[2] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0],ymm3[1,2,3] -; AVX1-ONLY-NEXT: vmovapd 496(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm6 = ymm6[0,0,3,2] -; AVX1-ONLY-NEXT: vmovapd 480(%rax), %ymm1 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm1[2,3],ymm6[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 440(%r9), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rax), %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovapd 480(%rsi), %ymm3 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3] +; AVX1-ONLY-NEXT: vmovapd 496(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,3,2] +; AVX1-ONLY-NEXT: vmovapd 480(%rax), %ymm15 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm5[1],ymm3[2],ymm5[3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm8[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 440(%r9), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2],ymm3[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1,2],ymm9[3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm2[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 472(%r9), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm15[0],ymm0[1],ymm15[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm2[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 472(%r9), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2],ymm3[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 504(%r9), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm9[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 504(%r9), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm12[0],ymm1[1],ymm12[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm15[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps 256(%rdx), %xmm13 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm10 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 320(%rdx), %xmm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 448(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 384(%rdx), %xmm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm9 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 256(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm9 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm9, 16(%rax) +; AVX1-ONLY-NEXT: 
vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 320(%rdx), %xmm14 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 448(%rdx), %xmm13 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 384(%rdx), %xmm12 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps %xmm11, 16(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm15, 2704(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm12, 2704(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm1, 2688(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm7, 3152(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm14, 3136(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm11, 2256(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm13, 3152(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm6, 3136(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm14, 2256(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm2, 2240(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm10, 1360(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm15, 1360(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm3, 1344(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm6, 464(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm12, 448(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm8, 912(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm4, 896(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm13, 1808(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm5, 1792(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm8, 464(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm4, 448(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm9, 912(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm5, 896(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm7, 1808(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm10, 1792(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 3520(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -11311,66 +11352,108 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 3232(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 3200(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 3168(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 3072(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 3040(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 
3008(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 2848(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2816(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2784(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 2752(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 2624(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2592(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2560(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 2400(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2368(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2336(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 2304(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 2176(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2144(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2112(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 1952(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1920(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1888(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 1856(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 1728(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1696(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1664(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 1504(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1472(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1440(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 1408(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 1280(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1248(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1216(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 1056(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 
32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1024(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 992(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 960(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 832(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 800(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 768(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 608(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 576(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 544(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 3552(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 3488(%rax) @@ -11383,8 +11466,6 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 3264(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 3200(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 3104(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2976(%rax) @@ -11395,16 +11476,10 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2880(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 2848(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 2752(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2720(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2656(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 2624(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 
2528(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2496(%rax) @@ -11413,16 +11488,10 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2432(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 2400(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 2304(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2272(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2208(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 2176(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2080(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2048(%rax) @@ -11431,16 +11500,10 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1984(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 1952(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 1856(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1824(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1760(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 1728(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1632(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1600(%rax) @@ -11449,16 +11512,10 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1536(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 1504(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 1408(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1376(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1312(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 1280(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1184(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1152(%rax) @@ -11467,16 +11524,10 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1088(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 1056(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 960(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; 
AVX1-ONLY-NEXT: vmovaps %ymm0, 928(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 864(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 832(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 736(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 704(%rax) @@ -11485,16 +11536,10 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 640(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 608(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) @@ -11503,873 +11548,989 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: addq $3960, %rsp # imm = 0xF78 +; AVX1-ONLY-NEXT: addq $3320, %rsp # imm = 0xCF8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i64_stride7_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $3624, %rsp # imm = 0xE28 +; AVX2-ONLY-NEXT: subq $4264, %rsp # imm = 0x10A8 ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm3 -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm6 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm6 ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm7 -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] -; AVX2-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm9 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd 
{{.*#+}} ymm4 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm11 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm11[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm3[0],mem[0],ymm3[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 16(%rax), %xmm4 -; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm5 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm5 +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 16(%rax), %xmm0 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm15[1],ymm1[3],ymm15[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm2 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm2[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; AVX2-ONLY-NEXT: vbroadcastsd 40(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm4[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps 48(%rax), %xmm1 -; AVX2-ONLY-NEXT: 
vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vbroadcastsd 72(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 32(%r9), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rax), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm1 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm2[0] +; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm4 +; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%r9), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-ONLY-NEXT: vbroadcastsd 40(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 48(%rax), %xmm0 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: 
vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vmovaps 80(%rax), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 96(%r9), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 96(%r8), %xmm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm1[0] ; AVX2-ONLY-NEXT: vmovaps 96(%rax), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; AVX2-ONLY-NEXT: vbroadcastsd 104(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX2-ONLY-NEXT: vbroadcastsd 104(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm2 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%r9), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; 
AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 112(%rax), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vbroadcastsd 136(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 128(%r8), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rax), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%r8), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%r9), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vmovaps 144(%rax), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX2-ONLY-NEXT: vmovaps 160(%r9), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 160(%r8), %xmm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm1[0] ; AVX2-ONLY-NEXT: vmovaps 160(%rax), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; AVX2-ONLY-NEXT: vbroadcastsd 168(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX2-ONLY-NEXT: vbroadcastsd 168(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm2 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 160(%r8), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%r9), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %ymm1 +; 
AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 176(%rax), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vbroadcastsd 200(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 192(%r8), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rax), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%r8), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%r9), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vmovaps 208(%rax), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 224(%r9), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 224(%r8), %xmm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm1[0] ; AVX2-ONLY-NEXT: vmovaps 224(%rax), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovaps 
224(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; AVX2-ONLY-NEXT: vbroadcastsd 232(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX2-ONLY-NEXT: vbroadcastsd 232(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm2 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%r9), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 240(%rax), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vbroadcastsd 264(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 256(%r8), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 256(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rax), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 256(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%r8), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%r9), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX2-ONLY-NEXT: vmovaps 256(%rdx), %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vmovaps 272(%rax), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 288(%r9), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 288(%r8), %xmm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm1[0] ; AVX2-ONLY-NEXT: vmovaps 288(%rax), %xmm2 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdx), %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; AVX2-ONLY-NEXT: vbroadcastsd 296(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 
288(%rsi), %ymm2 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 288(%r8), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%r9), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 304(%rax), %xmm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vbroadcastsd 328(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 320(%r8), %xmm7 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rax), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 320(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%r8), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%r9), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 336(%rax), %xmm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%r8), %xmm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 352(%rax), %xmm2 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rdx), %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; AVX2-ONLY-NEXT: vbroadcastsd 360(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 352(%rsi), %ymm2 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 352(%r8), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%r9), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 368(%rax), %xmm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vbroadcastsd 392(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 384(%r8), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rax), %xmm6 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 384(%rsi), %ymm4 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%r8), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%r9), %ymm2 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklpd 
{{.*#+}} ymm5 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-ONLY-NEXT: vmovaps 288(%rdx), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 288(%rsi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX2-ONLY-NEXT: vbroadcastsd 296(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 288(%rsi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX2-ONLY-NEXT: vmovaps 288(%rdx), %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vmovaps 304(%rax), %xmm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 320(%rsi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm0[1],ymm10[3],ymm0[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX2-ONLY-NEXT: vmovaps 320(%rdx), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovaps 336(%rax), %xmm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 352(%r9), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 352(%r8), %xmm0 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm1[0] +; AVX2-ONLY-NEXT: vmovaps 352(%rax), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 352(%rdx), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 352(%rsi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX2-ONLY-NEXT: vbroadcastsd 360(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 352(%rsi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX2-ONLY-NEXT: vmovaps 352(%rdx), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 368(%rax), %xmm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 384(%rsi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX2-ONLY-NEXT: vmovaps 384(%rdx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovaps 400(%rax), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 416(%r9), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 416(%r8), %xmm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 416(%rax), %xmm3 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm1[0] +; AVX2-ONLY-NEXT: vmovaps 416(%rax), %xmm8 +; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 416(%rdx), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 416(%rsi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm1[1] +; AVX2-ONLY-NEXT: vbroadcastsd 424(%rcx), %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 416(%rsi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX2-ONLY-NEXT: vmovaps 416(%rdx), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT: vmovaps 432(%rax), %xmm1
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm0
+; AVX2-ONLY-NEXT: vmovaps 448(%rsi), %ymm1
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
+; AVX2-ONLY-NEXT: vmovaps 448(%rdx), %ymm11
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm11[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 464(%rax), %xmm8
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 480(%rdx), %xmm1
+; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 480(%rsi), %xmm8
+; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm1[1]
+; AVX2-ONLY-NEXT: vbroadcastsd 488(%rcx), %ymm8
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT: vinsertf128 $1, 480(%r8), %ymm0, %ymm8
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm8[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm12
+; AVX2-ONLY-NEXT: vmovaps 480(%rsi), %ymm1
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3]
+; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
+; AVX2-ONLY-NEXT: vmovaps 480(%rdx), %ymm8
+; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm8[6,7]
+; AVX2-ONLY-NEXT: vmovaps 496(%rax), %xmm8
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm1
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5],ymm9[6,7]
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm8
+; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm8[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[2],ymm15[2]
+; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm8
+; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm13
+; AVX2-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm13[1],xmm8[1]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],mem[0],ymm9[2],mem[2]
+; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm8
+; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm9
+; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm8[0],ymm9[0],ymm8[2],ymm9[2]
+; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm8[2,3]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
+; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT: # xmm8 = xmm8[1],mem[1]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
+; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm8
+; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm9
+; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm8[0],ymm9[0],ymm8[2],ymm9[2]
+; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm8[2,3]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vbroadcastsd 72(%rcx), %ymm1
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5],ymm7[6,7]
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovaps 64(%r8), %xmm8
+; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm8[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
+; AVX2-ONLY-NEXT: vmovaps 64(%rax), %xmm8
+; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 64(%r9), %xmm13
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm13[1],xmm8[1]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],mem[0],ymm7[2],mem[2]
+; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm7
+; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 64(%r9), %ymm8
+; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2]
+; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm7[2,3]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
+; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT: # xmm7 = xmm7[1],mem[1]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
+; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm7
+; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 96(%r9), %ymm8
+; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2]
+; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm7[2,3]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vbroadcastsd 136(%rcx), %ymm1
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5],ymm6[6,7]
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovaps 128(%r8), %xmm7
+; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
+; AVX2-ONLY-NEXT: vmovaps 128(%rax), %xmm7
+; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 128(%r9), %xmm8
+; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm8[1],xmm7[1]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],mem[0],ymm6[2],mem[2]
+; AVX2-ONLY-NEXT: vmovaps 128(%r8), %ymm6
+; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 128(%r9), %ymm7
+; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[2],ymm7[2]
+; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm6[2,3]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
+; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT: # xmm6 = xmm6[1],mem[1]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
+; AVX2-ONLY-NEXT: vmovaps 160(%r8), %ymm6
+; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 160(%r9), %ymm7
+; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[2],ymm7[2]
+; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm6[2,3]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vbroadcastsd 200(%rcx), %ymm1
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5],ymm5[6,7]
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovaps 192(%r8), %xmm6
+; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
+; AVX2-ONLY-NEXT: vmovaps 192(%r9), %xmm7
+; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 192(%rax), %xmm6
+; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm7[1],xmm6[1]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],mem[0],ymm5[2],mem[2]
+; AVX2-ONLY-NEXT: vmovaps 192(%r8), %ymm5
+; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 192(%r9), %ymm6
+; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[2]
+; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
+; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT: # xmm5 = xmm5[1],mem[1]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
+; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm5
+; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 224(%r9), %ymm6
+; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[2]
+; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vbroadcastsd 264(%rcx), %ymm1
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5],ymm4[6,7]
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovaps 256(%r8), %xmm5
+; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
+; AVX2-ONLY-NEXT: vmovaps 256(%rax), %xmm5
+; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 256(%r9), %xmm15
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm15[1],xmm5[1]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],mem[0],ymm4[2],mem[2]
+; AVX2-ONLY-NEXT: vmovaps 256(%r8), %ymm4
+; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 256(%r9), %ymm5
+; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
+; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
+; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT: # xmm4 = xmm4[1],mem[1]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],mem[0],ymm14[2],mem[2]
+; AVX2-ONLY-NEXT: vmovaps 288(%r8), %ymm4
+; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 288(%r9), %ymm5
+; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
+; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vbroadcastsd 328(%rcx), %ymm1
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5],ymm3[6,7]
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovaps 320(%r8), %xmm4
 ; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 416(%rdx), %xmm12
+; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm1 = ymm10[0],mem[0],ymm10[2],mem[2]
+; AVX2-ONLY-NEXT: vmovaps 320(%r9), %xmm6
+; AVX2-ONLY-NEXT: vmovaps 320(%rax), %xmm4
+; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm6[1],xmm4[1]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],mem[0],ymm3[2],mem[2]
+; AVX2-ONLY-NEXT: vmovaps 320(%r8), %ymm14
+; AVX2-ONLY-NEXT: vmovaps 320(%r9), %ymm10
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm14[0],ymm10[0],ymm14[2],ymm10[2]
+; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
+; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT: # xmm3 = xmm3[1],mem[1]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
+; AVX2-ONLY-NEXT: vmovaps 352(%r8), %ymm3
+; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 352(%r9), %ymm4
+; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
+; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vbroadcastsd 392(%rcx), %ymm1
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovaps 384(%r8), %xmm3
+; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
+; AVX2-ONLY-NEXT: vmovaps 384(%r9), %xmm4
+; AVX2-ONLY-NEXT: vmovaps 384(%rax), %xmm3
+; AVX2-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2]
+; AVX2-ONLY-NEXT: vmovaps 384(%r8), %ymm3
+; AVX2-ONLY-NEXT: vmovaps 384(%r9), %ymm9
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm9[0],ymm3[2],ymm9[2]
+; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
+; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT: # xmm2 = xmm2[1],mem[1]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 416(%rcx), %ymm1
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
+; AVX2-ONLY-NEXT: vmovaps 416(%r8), %ymm2
+; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2]
+; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 448(%r8), %ymm8
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm8[0,1],ymm0[0,1]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
+; AVX2-ONLY-NEXT: vbroadcastsd 448(%rax), %ymm2
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vbroadcastsd 456(%rcx), %ymm1
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5],ymm11[6,7]
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%r8), %ymm0, %ymm2
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
-; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1]
-; AVX2-ONLY-NEXT: vbroadcastsd 424(%rcx), %ymm4
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7]
-; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
-; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm1
-; AVX2-ONLY-NEXT: vmovaps 416(%rsi), %ymm3
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 416(%r8), %ymm0
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
-; AVX2-ONLY-NEXT: vmovaps 416(%rdx), %ymm4
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm4[0],mem[0],ymm4[2],mem[2]
-; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm3[1],ymm1[3],ymm3[3]
-; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-ONLY-NEXT: vmovaps 432(%rax), %xmm1
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm1
-; AVX2-ONLY-NEXT: vmovaps 448(%r8), %ymm13
+; AVX2-ONLY-NEXT: vmovaps 480(%r8), %ymm7
 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm13[0,1],ymm1[0,1]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7]
-; AVX2-ONLY-NEXT: vbroadcastsd 448(%rax), %ymm3
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm7[0,1],ymm12[0,1]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
+; AVX2-ONLY-NEXT: vbroadcastsd 480(%rax), %ymm2
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 448(%rdx), %ymm8
-; AVX2-ONLY-NEXT: vbroadcastsd 456(%rcx), %ymm3
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5],ymm8[6,7]
-; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%r8), %ymm0, %ymm4
+; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm0 = ymm12[0],mem[0],ymm12[2],mem[2]
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm1
+; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm2
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT: vbroadcastsd 32(%rcx), %ymm2
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm2
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm13[0]
+; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm13
+; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm0
+; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT: vbroadcastsd 88(%rcx), %ymm2
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT: vbroadcastsd 96(%rcx), %ymm2
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm2
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm12
+; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm0
+; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT: vbroadcastsd 152(%rcx), %ymm2
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT: vbroadcastsd 160(%rcx), %ymm2
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm2
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm11
+; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm0
+; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT: vbroadcastsd 216(%rcx), %ymm2
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT: vbroadcastsd 224(%rcx), %ymm2
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm2
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm15[0]
+; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm5
+; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT: vbroadcastsd 280(%rcx), %ymm2
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT: vbroadcastsd 288(%rcx), %ymm2
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT: vbroadcastsd 312(%rcx), %ymm2
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm6[0]
+; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm2
+; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm14[1],ymm10[1],ymm14[3],ymm10[3]
+; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT: vbroadcastsd 344(%rcx), %ymm6
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT: vbroadcastsd 352(%rcx), %ymm6
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT: vbroadcastsd 376(%rcx), %ymm6
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm4[0]
+; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm1
+; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm4
+; AVX2-ONLY-NEXT: vbroadcastsd (%rsp), %ymm6 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm9[1],ymm3[3],ymm9[3]
+; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT: vbroadcastsd 408(%rcx), %ymm4
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm3[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT: # xmm3 = xmm0[0],mem[0]
+; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT: vbroadcastsd 416(%rcx), %ymm4
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm4[6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 448(%rsi), %ymm3
-; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0]
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm5[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3]
-; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm8[6,7]
-; AVX2-ONLY-NEXT: vmovaps 464(%rax), %xmm3
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm1
-; AVX2-ONLY-NEXT: vmovaps 480(%r8), %ymm14
-; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
-; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm14[0,1],ymm1[0,1]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6,7]
-; AVX2-ONLY-NEXT: vbroadcastsd 480(%rax), %ymm4
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 480(%rdx), %xmm10
-; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
-; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm10[1]
-; AVX2-ONLY-NEXT: vbroadcastsd 488(%rcx), %ymm4
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7]
-; AVX2-ONLY-NEXT: vinsertf128 $1, 480(%r8), %ymm0, %ymm4
+; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm3
+; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT: # xmm3 = xmm3[0],mem[0]
+; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT: vbroadcastsd 480(%rcx), %ymm4
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm4[6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 480(%rsi), %ymm3
-; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0]
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm5[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3]
-; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
-; AVX2-ONLY-NEXT: vmovaps 480(%rdx), %ymm5
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7]
-; AVX2-ONLY-NEXT: vmovaps 496(%rax), %xmm3
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm9[0],mem[0]
-; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm0
-; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
-; AVX2-ONLY-NEXT: vbroadcastsd %xmm11, %ymm3
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm1 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm3
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm15[0],mem[0]
-; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT: vbroadcastsd 32(%rcx), %ymm3
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm1 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm3
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm0
-; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
-; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm1 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT: vbroadcastsd 88(%rcx), %ymm3
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT: vbroadcastsd 96(%rcx), %ymm3
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm1 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm3
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm11
-; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm1, %ymm1
-; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm1 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT: vbroadcastsd 152(%rcx), %ymm3
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT: vbroadcastsd 160(%rcx), %ymm3
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm1 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm3
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm9
-; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm1, %ymm1
-; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm1 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT: vbroadcastsd 216(%rcx), %ymm3
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT: vbroadcastsd 224(%rcx), %ymm3
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm1 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm3
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm15
-; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm1, %ymm1
-; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm1 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT: vbroadcastsd 280(%rcx), %ymm3
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT: vbroadcastsd 288(%rcx), %ymm3
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm1 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT: vbroadcastsd 312(%rcx), %ymm3
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm7[0],mem[0]
-; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm7
-; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1
-; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm1 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT: vbroadcastsd 344(%rcx), %ymm3
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT: vbroadcastsd 352(%rcx), %ymm3
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm1 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT: vbroadcastsd 376(%rcx), %ymm3
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm4
-; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; AVX2-ONLY-NEXT: vbroadcastsd %xmm6, %ymm6
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
-; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,3,3]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT: vbroadcastsd 408(%rcx), %ymm3
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm2[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm2, %ymm2
-; AVX2-ONLY-NEXT: vbroadcastsd 416(%rcx), %ymm3
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vbroadcastsd 440(%rcx), %ymm2
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX2-ONLY-NEXT: vbroadcastsd 440(%r9), %ymm3
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7]
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm8[0],mem[0],ymm8[2],mem[2]
-; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm13[2,3]
-; AVX2-ONLY-NEXT: vbroadcastsd 464(%r9), %ymm3
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vbroadcastsd 472(%rcx), %ymm2
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX2-ONLY-NEXT: vbroadcastsd 472(%r9), %ymm3
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7]
-; AVX2-ONLY-NEXT: vmovaps 448(%rax), %ymm3
-; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm1 = mem[0,1],ymm3[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm2
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0]
-; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm2
-; AVX2-ONLY-NEXT: vbroadcastsd 480(%rcx), %ymm3
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm5[0],mem[0],ymm5[2],mem[2]
-; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm14[2,3]
-; AVX2-ONLY-NEXT: vbroadcastsd 496(%r9), %ymm3
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vbroadcastsd 504(%rcx), %ymm2
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX2-ONLY-NEXT: vbroadcastsd 504(%r9), %ymm3
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7]
-; AVX2-ONLY-NEXT: vmovaps 480(%rax), %ymm8
-; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm1 = mem[0,1],ymm8[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm8[6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm3 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX2-ONLY-NEXT: vbroadcastsd 440(%r9), %ymm4
+; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm4[2,3]
+; AVX2-ONLY-NEXT: vmovaps 448(%rcx), %ymm3
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
+; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm8[2,3]
+; AVX2-ONLY-NEXT: vbroadcastsd 464(%r9), %ymm6
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm8[1],ymm3[3],ymm8[3]
+; AVX2-ONLY-NEXT: vbroadcastsd 472(%r9), %ymm4
+; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3]
+; AVX2-ONLY-NEXT: vmovaps 448(%rax), %ymm4
+; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm6 = mem[0,1],ymm4[2,3],mem[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 480(%rcx), %ymm3
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
+; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm7[2,3]
+; AVX2-ONLY-NEXT: vbroadcastsd 496(%r9), %ymm6
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm7[1],ymm3[3],ymm7[3]
+; AVX2-ONLY-NEXT: vbroadcastsd 504(%r9), %ymm4
+; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3]
+; AVX2-ONLY-NEXT: vmovaps 480(%rax), %ymm7
+; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm4 = mem[0,1],ymm7[2,3],mem[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r8
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm15[0],mem[0]
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0]
 ; AVX2-ONLY-NEXT: vmovaps 256(%rdx), %xmm0
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0]
-; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %xmm15
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0]
-; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm12
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm0[0],mem[0]
 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0]
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0]
-; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %xmm13
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0]
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0]
-; AVX2-ONLY-NEXT: vmovaps 320(%rdx), %xmm6
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0]
-; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm10
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0]
-; AVX2-ONLY-NEXT: vmovaps 448(%rdx), %xmm8
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0]
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0]
-; AVX2-ONLY-NEXT: vmovaps 384(%rdx), %xmm14
+; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %xmm14
 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0]
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0]
+; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm15
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0]
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0]
+; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %xmm10
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX2-ONLY-NEXT: vmovaps 320(%rdx), %xmm9
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0]
+; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm8
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0]
+; AVX2-ONLY-NEXT: vmovaps 448(%rdx), %xmm7
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0]
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX2-ONLY-NEXT: vmovaps 384(%rdx), %xmm6
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0]
 ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm5
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0]
-; AVX2-ONLY-NEXT: vmovaps %xmm5, 16(%r8)
+; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm4
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0]
+; AVX2-ONLY-NEXT: vmovaps %xmm4, 16(%r8)
 ; AVX2-ONLY-NEXT: vmovaps %xmm0, (%r8)
-; AVX2-ONLY-NEXT: vmovaps %xmm14, 2704(%r8)
-; AVX2-ONLY-NEXT: vmovaps %xmm4, 2688(%r8)
-; AVX2-ONLY-NEXT: vmovaps %xmm8, 3152(%r8)
-; AVX2-ONLY-NEXT: vmovaps %xmm10, 3136(%r8)
-; AVX2-ONLY-NEXT: vmovaps %xmm6, 2256(%r8)
-; AVX2-ONLY-NEXT: vmovaps %xmm7, 2240(%r8)
-; AVX2-ONLY-NEXT: vmovaps %xmm13, 1360(%r8)
-; AVX2-ONLY-NEXT: vmovaps %xmm9, 1344(%r8)
-; AVX2-ONLY-NEXT: vmovaps %xmm12, 464(%r8)
-; AVX2-ONLY-NEXT: vmovaps %xmm1, 448(%r8)
-; AVX2-ONLY-NEXT: vmovaps %xmm15, 912(%r8)
-; AVX2-ONLY-NEXT: vmovaps %xmm11, 896(%r8)
-; AVX2-ONLY-NEXT: vmovaps %xmm2, 1808(%r8)
-; AVX2-ONLY-NEXT: vmovaps %xmm3, 1792(%r8)
+; AVX2-ONLY-NEXT: vmovaps %xmm6, 2704(%r8)
+; AVX2-ONLY-NEXT: vmovaps %xmm1, 2688(%r8)
+; AVX2-ONLY-NEXT: vmovaps %xmm7, 3152(%r8)
+; AVX2-ONLY-NEXT: vmovaps %xmm8, 3136(%r8)
+; AVX2-ONLY-NEXT: vmovaps %xmm9, 2256(%r8)
+; AVX2-ONLY-NEXT: vmovaps %xmm2, 2240(%r8)
+; AVX2-ONLY-NEXT: vmovaps %xmm10, 1360(%r8)
+; AVX2-ONLY-NEXT: vmovaps %xmm11, 1344(%r8)
+; AVX2-ONLY-NEXT: vmovaps %xmm15, 464(%r8)
+; AVX2-ONLY-NEXT: vmovaps %xmm13, 448(%r8)
+; AVX2-ONLY-NEXT: vmovaps %xmm14, 912(%r8)
+; AVX2-ONLY-NEXT: vmovaps %xmm12, 896(%r8)
+; AVX2-ONLY-NEXT: vmovaps %xmm3, 1808(%r8)
+; AVX2-ONLY-NEXT: vmovaps %xmm5, 1792(%r8)
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 3552(%r8)
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vmovaps %ymm0, 3520(%r8)
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 3488(%r8)
-; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vmovaps %ymm0, 3456(%r8)
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vmovaps %ymm0, 3424(%r8)
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 3456(%r8)
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 3392(%r8)
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vmovaps %ymm0, 3360(%r8)
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 3328(%r8)
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vmovaps %ymm0, 3296(%r8)
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 3264(%r8)
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 3232(%r8)
@@ -12380,12 +12541,86 @@
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 3104(%r8)
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vmovaps %ymm0, 3072(%r8)
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 3040(%r8)
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 3008(%r8)
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 2816(%r8)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 2784(%r8)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 2752(%r8)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 2592(%r8)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 2560(%r8)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 2368(%r8)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 2336(%r8)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 2304(%r8)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 2144(%r8)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 2112(%r8)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 1920(%r8)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 1888(%r8)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 1856(%r8)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 1696(%r8)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 1664(%r8)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 1472(%r8)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 1440(%r8)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 1408(%r8)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 1248(%r8)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 1216(%r8)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 1024(%r8)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 992(%r8)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 960(%r8)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 800(%r8)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 768(%r8)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 576(%r8)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 544(%r8)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 512(%r8)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 352(%r8)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%r8)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%r8)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%r8)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%r8)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 3520(%r8)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 3424(%r8)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 3360(%r8)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 3296(%r8)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 3072(%r8)
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2976(%r8)
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2944(%r8)
@@ -12396,22 +12631,12 @@
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2848(%r8)
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vmovaps %ymm0, 2816(%r8)
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vmovaps %ymm0, 2784(%r8)
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vmovaps %ymm0, 2752(%r8)
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2720(%r8)
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2656(%r8)
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2624(%r8)
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vmovaps %ymm0, 2592(%r8)
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vmovaps %ymm0, 2560(%r8)
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2528(%r8)
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2496(%r8)
@@ -12422,22 +12647,12 @@
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2400(%r8)
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vmovaps %ymm0, 2368(%r8)
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vmovaps %ymm0, 2336(%r8)
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vmovaps %ymm0, 2304(%r8)
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2272(%r8)
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2208(%r8)
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2176(%r8)
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vmovaps %ymm0, 2144(%r8)
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vmovaps %ymm0,
2112(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2080(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2048(%r8) @@ -12448,22 +12663,12 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1952(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1920(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1888(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1856(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1824(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1760(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1728(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1696(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1664(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1632(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1600(%r8) @@ -12474,22 +12679,12 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1504(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1472(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1440(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1408(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1376(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1312(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1280(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1248(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1216(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1184(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1152(%r8) @@ -12500,22 +12695,12 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1056(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1024(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 992(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 960(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 928(%r8) ; AVX2-ONLY-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 864(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 832(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 800(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 768(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 736(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 704(%r8) @@ -12526,22 +12711,12 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 608(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 576(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 544(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 512(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 416(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 352(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%r8) @@ -12552,14 +12727,8 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%r8) -; AVX2-ONLY-NEXT: addq $3624, %rsp # imm = 0xE28 +; AVX2-ONLY-NEXT: addq $4264, %rsp # imm = 0x10A8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-narrow-binop.ll b/llvm/test/CodeGen/X86/vector-narrow-binop.ll --- a/llvm/test/CodeGen/X86/vector-narrow-binop.ll +++ b/llvm/test/CodeGen/X86/vector-narrow-binop.ll @@ -107,9 +107,11 @@ ; SSE: # %bb.0: ; SSE-NEXT: pxor %xmm2, %xmm2 ; SSE-NEXT: psubd %xmm0, %xmm2 -; SSE-NEXT: psrld $16, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE-NEXT: psrlq $16, %xmm2 +; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm0 
; SSE-NEXT: retq ; @@ -117,9 +119,8 @@ ; AVX: # %bb.0: ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpsubd %xmm0, %xmm2, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: retq %sub = sub <2 x i32> , %x %bc = bitcast <2 x i32> %sub to <8 x i8> @@ -163,28 +164,32 @@ ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX1-NEXT: vmulpd %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vmulpd %xmm2, %xmm2, %xmm1 -; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-NEXT: vmulpd %xmm2, %xmm2, %xmm1 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: fmul_v2f64: ; AVX2: # %bb.0: -; AVX2-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-NEXT: vmulpd %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmulpd %xmm2, %xmm2, %xmm1 -; AVX2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm1[1] +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: vmulpd %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] +; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: fmul_v2f64: ; AVX512: # %bb.0: ; AVX512-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm1[1] ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512-NEXT: vmulpd %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vfmadd231pd {{.*#+}} xmm0 = (xmm2 * xmm2) + xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX512-NEXT: vmulpd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] +; AVX512-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: retq %s = shufflevector <2 x double> %x, <2 x double> %y, <4 x i32> %bo = fmul fast <4 x double> %s, %s diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll @@ -1517,34 +1517,36 @@ define <4 x i32> @shuffle_v4i32_2456(<4 x i32> %a, <4 x i32> %b) { ; SSE2-LABEL: shuffle_v4i32_2456: ; SSE2: # %bb.0: -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2] ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v4i32_2456: ; SSE3: # %bb.0: -; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1] +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0] ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: shuffle_v4i32_2456: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] -; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: palignr {{.*#+}} xmm2 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; SSSE3-NEXT: shufps {{.*#+}} 
xmm2 = xmm2[0,2],xmm1[1,2] +; SSSE3-NEXT: movaps %xmm2, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: shuffle_v4i32_2456: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,2] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1OR2-LABEL: shuffle_v4i32_2456: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] +; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,2] +; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX1OR2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4i32_2456: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll @@ -830,20 +830,19 @@ define internal fastcc <8 x float> @PR34577(<8 x float> %inp0, <8 x float> %inp1, <8 x float> %inp2) { ; AVX2-LABEL: PR34577: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,1,1] -; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <u,u,7,2,u,u,3,2> -; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,7,2,u,u,3,2> +; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[u,u,u,u,u,u,u,u,24,25,26,27,28,29,30,31,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: PR34577: ; AVX512: # %bb.0: # %entry ; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,1,1] +; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] +; AVX512-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] ; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = <23,18,7,2,20,u,3,2> ; AVX512-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0