diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -14326,6 +14326,8 @@
       return SDValue();
     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
                                  NumElements / Scale);
+    if (InputV.getValueType() != VT)
+      InputV = DAG.getBitcast(VT, InputV);
    InputV = ShuffleOffset(InputV);
    InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
                                    DL, ExtVT, InputV, DAG);
@@ -14333,6 +14335,8 @@
   }
 
   assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
+  if (InputV.getValueType() != VT)
+    InputV = DAG.getBitcast(VT, InputV);
 
   // For any extends we can cheat for larger element sizes and use shuffle
   // instructions that can fold with a load and/or copy.
@@ -15449,6 +15453,11 @@
   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
 
+  if (Subtarget.hasSSE41())
+    if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
+                                            Zeroable, Subtarget, DAG))
+      return Blend;
+
   int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
 
   if (NumV2Elements == 0) {
@@ -15487,32 +15496,35 @@
                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
   }
 
+  if (Subtarget.hasSSE2())
+    if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
+            DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
+      ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
+      return ZExt;
+    }
+
   if (Subtarget.hasAVX2())
     if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
       return Extract;
 
-  // There are special ways we can lower some single-element blends. However, we
-  // have custom ways we can lower more complex single-element blends below that
-  // we defer to if both this and BLENDPS fail to match, so restrict this to
-  // when the V2 input is targeting element 0 of the mask -- that is the fast
-  // case here.
+  // There are special ways we can lower some single-element blends. However,
+  // we have custom ways we can lower more complex single-element blends below
+  // that we defer to if both this and BLENDPS fail to match, so restrict this
+  // to when the V2 input is targeting element 0 of the mask -- that is the
+  // fast case here.
   if (NumV2Elements == 1 && Mask[0] >= 4)
-    if (SDValue V = lowerShuffleAsElementInsertion(
-            DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
+    if (SDValue V = lowerShuffleAsElementInsertion(DL, MVT::v4f32, V1, V2, Mask,
+                                                   Zeroable, Subtarget, DAG))
       return V;
 
   if (Subtarget.hasSSE41()) {
-    if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
-                                            Zeroable, Subtarget, DAG))
-      return Blend;
-
     // Use INSERTPS if we can complete the shuffle efficiently.
     if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
       return V;
 
     if (!isSingleSHUFPSMask(Mask))
-      if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
-                                                            V2, Mask, DAG))
+      if (SDValue BlendPerm =
+              lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1, V2, Mask, DAG))
        return BlendPerm;
   }
 
@@ -16875,7 +16887,7 @@
 /// AVX vector shuffle types.
 static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
                                     SDValue V2, ArrayRef<int> Mask,
-                                    SelectionDAG &DAG) {
+                                    SelectionDAG &DAG, bool SimpleOnly) {
   assert(VT.getSizeInBits() >= 256 &&
          "Only for 256-bit or wider vector shuffles!");
   assert(V1.getSimpleValueType() == VT && "Bad operand type!");
@@ -16931,6 +16943,9 @@
   // manually combine these blend masks as much as possible so that we create
   // a minimal number of high-level vector shuffle nodes.
 
+  if (SimpleOnly && (UseHiV1 || UseHiV2))
+    return SDValue();
+
   // First try just blending the halves of V1 or V2.
   if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
     return DAG.getUNDEF(SplitVT);
@@ -16964,6 +16979,8 @@
   };
   SDValue Lo = HalfBlend(LoMask);
   SDValue Hi = HalfBlend(HiMask);
+  if (!Lo || !Hi)
+    return SDValue();
   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
 }
 
@@ -17018,7 +17035,8 @@
     if (Mask[i] >= 0)
       LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
   if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
-    return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+    return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
+                                /*SimpleOnly*/ false);
 
   // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
   // requires that the decomposed single-input shuffles don't end up here.
@@ -17224,7 +17242,8 @@
   // If we're not using both lanes in each lane and the inlane mask is not
   // repeating, then we're better off splitting.
   if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
-    return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+    return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
+                                /*SimpleOnly*/ false);
 
   // Flip the lanes, and shuffle the results which should now be in-lane.
   MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
@@ -18359,6 +18378,32 @@
                                                   Subtarget, DAG))
     return Broadcast;
 
+#if 1
+  {
+    int Size = Mask.size();
+    int LaneSize = Size / 2;
+    SmallVector<int> InLaneMask(Mask);
+    for (int i = 0; i < Size; ++i) {
+      int &M = InLaneMask[i];
+      if (M < 0)
+        continue;
+      if (((M % Size) / LaneSize) != (i / LaneSize))
+        M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
+    }
+
+    if (!Subtarget.hasAVX2() &&
+        !is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
+      if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
+                                           /*SimpleOnly*/ true))
+        return R;
+  }
+  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
+                                                   Zeroable, Subtarget, DAG)) {
+    ZExt = DAG.getBitcast(MVT::v8f32, ZExt);
+    return ZExt;
+  }
+#endif
+
   // If the shuffle mask is repeated in each 128-bit lane, we have many more
   // options to efficiently lower the shuffle.
   SmallVector<int, 4> RepeatedMask;
@@ -18851,7 +18896,7 @@
       return V;
     if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
       return V;
-    return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+    return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
   }
 
   MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
@@ -19086,6 +19131,16 @@
     return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
   }
 
+  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
+                                          Zeroable, Subtarget, DAG))
+    return Blend;
+
+  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
+          DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
+    ZExt = DAG.getBitcast(MVT::v16f32, ZExt);
+    return ZExt;
+  }
+
   // Try to create an in-lane repeating shuffle mask and then shuffle the
   // results into the target lanes.
   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
@@ -19094,15 +19149,14 @@
 
   // If we have a single input shuffle with different shuffle patterns in the
   // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
-  if (V2.isUndef() &&
-      !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
+  if (V2.isUndef() && !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
     SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
     return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
   }
 
   // If we have AVX512F support, we can use VEXPAND.
-  if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
-                                       V1, V2, DAG, Subtarget))
+  if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask, V1, V2,
+                                       DAG, Subtarget))
     return V;
 
   return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
@@ -19403,7 +19457,7 @@
   if (Subtarget.hasVBMI())
     return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
 
-  return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
+  return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
 }
 
 /// High-level routine to lower various 512-bit x86 vector shuffles.
@@ -19448,7 +19502,7 @@
     if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
       return V;
 
-    return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+    return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
   }
 
   if (VT == MVT::v32f16) {
diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
--- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
@@ -4,14 +4,23 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq,+avx512vbmi | FileCheck --check-prefixes=CHECK,SKX %s
 
 define <16 x float> @test1(<16 x float> %x, ptr %br, float %y) nounwind {
-; CHECK-LABEL: test1:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3]
-; CHECK-NEXT:    vinsertf32x4 $0, %xmm2, %zmm0, %zmm2
-; CHECK-NEXT:    vbroadcastss %xmm1, %zmm1
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,30,15]
-; CHECK-NEXT:    vpermi2ps %zmm1, %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; KNL-LABEL: test1:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3]
+; KNL-NEXT:    vinsertf32x4 $0, %xmm2, %zmm0, %zmm0
+; KNL-NEXT:    movw $16384, %ax ## imm = 0x4000
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    vbroadcastss %xmm1, %zmm0 {%k1}
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test1:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3]
+; SKX-NEXT:    vinsertf32x4 $0, %xmm2, %zmm0, %zmm0
+; SKX-NEXT:    movw $16384, %ax ## imm = 0x4000
+; SKX-NEXT:    kmovd %eax, %k1
+; SKX-NEXT:    vbroadcastss %xmm1, %zmm0 {%k1}
+; SKX-NEXT:    retq
   %rrr = load float, ptr %br
   %rrr2 = insertelement <16 x float> %x, float %rrr, i32 1
   %rrr3 = insertelement <16 x float> %rrr2, float %y, i32 14
diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll
--- a/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll
+++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll
@@ -305,19 +305,15 @@
 define <16 x float> @merge_16f32_f32_0uu3zzuuuuuzCuEF(ptr %ptr) nounwind uwtable noinline ssp {
 ; ALL-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vmovups (%rdi), %zmm1
-; ALL-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; ALL-NEXT:    vmovaps {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
-; ALL-NEXT:    vpermi2ps %zmm2, %zmm1, %zmm0
+; ALL-NEXT:    vmovdqu64 (%rdi), %zmm0
+; ALL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
 ; ALL-NEXT:    retq
 ;
 ; X86-AVX512F-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF:
 ; X86-AVX512F:       # %bb.0:
 ; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512F-NEXT:    vmovups (%eax), %zmm1
-; X86-AVX512F-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; X86-AVX512F-NEXT:    vmovaps {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
-; X86-AVX512F-NEXT:    vpermi2ps %zmm2, %zmm1, %zmm0
+; X86-AVX512F-NEXT:    vmovdqu64 (%eax), %zmm0
+; X86-AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0
 ; X86-AVX512F-NEXT:    retl
   %ptr3 = getelementptr inbounds float, ptr %ptr, i64 3
   %ptrC = getelementptr inbounds float, ptr %ptr, i64 12
diff --git a/llvm/test/CodeGen/X86/pr43866.ll b/llvm/test/CodeGen/X86/pr43866.ll
--- a/llvm/test/CodeGen/X86/pr43866.ll
+++ b/llvm/test/CodeGen/X86/pr43866.ll
@@ -15,12 +15,9 @@
 ; CHECK-NEXT:    subq $64, %rsp
 ; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[1,0],xmm0[1,0]
+; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm2[2,0],xmm0[0,0]
 ; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmptrueps %ymm2, %ymm2, %ymm2
-; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; CHECK-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
-; CHECK-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[0,0],ymm1[6,4],ymm0[4,4]
 ; CHECK-NEXT:    vmovaps %ymm0, (%rsp)
 ; CHECK-NEXT:    movq %rbp, %rsp
 ; CHECK-NEXT:    popq %rbp
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -491,16 +491,27 @@
 ;
 ; AVX2-LABEL: shuffle_v8f32_091b2d3f:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <0,u,1,u,2,u,3,u>
-; AVX2-NEXT:    vpermps %ymm0, %ymm2, %ymm0
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
 ; AVX2-NEXT:    retq
 ;
-; AVX512VL-LABEL: shuffle_v8f32_091b2d3f:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovaps {{.*#+}} ymm2 = [0,9,1,11,2,13,3,15]
-; AVX512VL-NEXT:    vpermt2ps %ymm1, %ymm2, %ymm0
-; AVX512VL-NEXT:    retq
+; AVX512VL-SLOW-LABEL: shuffle_v8f32_091b2d3f:
+; AVX512VL-SLOW:       # %bb.0:
+; AVX512VL-SLOW-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512VL-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX512VL-SLOW-NEXT:    retq
+;
+; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_091b2d3f:
+; AVX512VL-FAST-ALL:       # %bb.0:
+; AVX512VL-FAST-ALL-NEXT:    vmovaps {{.*#+}} ymm2 = [0,9,1,11,2,13,3,15]
+; AVX512VL-FAST-ALL-NEXT:    vpermt2ps %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-ALL-NEXT:    retq
+;
+; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_091b2d3f:
+; AVX512VL-FAST-PERLANE:       # %bb.0:
+; AVX512VL-FAST-PERLANE-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512VL-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX512VL-FAST-PERLANE-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
   ret <8 x float> %shuffle
 }
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -2820,17 +2820,17 @@
 ; SSE2-LABEL: PR30264:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],mem[2,3]
-; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE2-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[0],mem[1]
+; SSE2-NEXT:    movapd %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: PR30264:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    xorps %xmm1, %xmm1
-; SSSE3-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],mem[2,3]
-; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSSE3-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[0],mem[1]
+; SSSE3-NEXT:    movapd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: PR30264:
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
@@ -4677,20 +4677,16 @@
 ; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX-NEXT:    vpaddb 16(%rsi), %xmm1, %xmm1
 ; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
+; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
+; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
 ; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX-NEXT:    vpaddb 16(%rdx), %xmm2, %xmm2
-; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
 ; AVX-NEXT:    vpaddb 32(%rdx), %xmm1, %xmm1
+; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
+; AVX-NEXT:    vpaddb (%rdx), %xmm2, %xmm2
+; AVX-NEXT:    vmovdqa %xmm2, (%rcx)
+; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
 ; AVX-NEXT:    vmovdqa %xmm1, 32(%rcx)
-; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
-; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
-; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
 ; AVX2-LABEL: vec384_v12i32_to_v6i64_factor2:
@@ -7005,26 +7001,19 @@
 ; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX-NEXT:    vpaddb 16(%rsi), %xmm1, %xmm1
 ; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
-; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
-; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT:    vpaddb 48(%rdx), %xmm2, %xmm2
-; AVX-NEXT:    vpaddb 32(%rdx), %xmm1, %xmm1
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX-NEXT:    vpaddb 16(%rdx), %xmm3, %xmm3
-; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
-; AVX-NEXT:    vmovdqa %xmm3, 16(%rcx)
-; AVX-NEXT:    vmovdqa %xmm1, 32(%rcx)
-; AVX-NEXT:    vmovdqa %xmm2, 48(%rcx)
-; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
+; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero
+; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX-NEXT:    vpaddb 48(%rdx), %xmm1, %xmm1
+; AVX-NEXT:    vpaddb 32(%rdx), %xmm4, %xmm3
+; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
+; AVX-NEXT:    vpaddb (%rdx), %xmm2, %xmm2
+; AVX-NEXT:    vmovdqa %xmm2, (%rcx)
+; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
+; AVX-NEXT:    vmovdqa %xmm3, 32(%rcx)
+; AVX-NEXT:    vmovdqa %xmm1, 48(%rcx)
 ; AVX-NEXT:    retq
 ;
 ; AVX2-LABEL: vec512_v16i32_to_v8i64_factor2: