diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -34171,11 +34171,18 @@ unsigned MaskEltSize = MaskVT.getScalarSizeInBits(); // Match against a VZEXT_MOVL vXi32 zero-extending instruction. - if (MaskEltSize == 32 && isUndefOrEqual(Mask[0], 0) && - isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) { - Shuffle = X86ISD::VZEXT_MOVL; - SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT; - return true; + if (MaskEltSize == 32 && Mask[0] == 0) { + if (isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) { + Shuffle = X86ISD::VZEXT_MOVL; + SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT; + return true; + } + if (V1.getOpcode() == ISD::SCALAR_TO_VECTOR && + isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) { + Shuffle = X86ISD::VZEXT_MOVL; + SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT; + return true; + } } // Match against a ANY/ZERO_EXTEND_VECTOR_INREG instruction. @@ -35032,16 +35039,30 @@ // from a scalar. // TODO: Handle other insertions here as well? if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 && - MaskEltSizeInBits == 32 && Subtarget.hasSSE41() && - !isTargetShuffleEquivalent(Mask, {4, 1, 2, 3})) { - SDValue SrcV1 = V1, SrcV2 = V2; - if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask, DAG) && - SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) { + Subtarget.hasSSE41() && !isTargetShuffleEquivalent(Mask, {4, 1, 2, 3})) { + if (MaskEltSizeInBits == 32) { + SDValue SrcV1 = V1, SrcV2 = V2; + if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask, + DAG) && + SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) { + if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS) + return SDValue(); // Nothing to do! + Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, + DAG.getBitcast(MVT::v4f32, SrcV1), + DAG.getBitcast(MVT::v4f32, SrcV2), + DAG.getTargetConstant(PermuteImm, DL, MVT::i8)); + return DAG.getBitcast(RootVT, Res); + } + } + if (MaskEltSizeInBits == 64 && isTargetShuffleEquivalent(Mask, {0, 2}) && + V2.getOpcode() == ISD::SCALAR_TO_VECTOR && + V2.getScalarValueSizeInBits() <= 32) { if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS) return SDValue(); // Nothing to do! + PermuteImm = (/*DstIdx*/2 << 4) | (/*SrcIdx*/0 << 0); Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, - DAG.getBitcast(MVT::v4f32, SrcV1), - DAG.getBitcast(MVT::v4f32, SrcV2), + DAG.getBitcast(MVT::v4f32, V1), + DAG.getBitcast(MVT::v4f32, V2), DAG.getTargetConstant(PermuteImm, DL, MVT::i8)); return DAG.getBitcast(RootVT, Res); } @@ -35704,6 +35725,14 @@ return DAG.getBitcast(VT, CstOp); } +namespace llvm { + namespace X86 { + enum { + MaxShuffleCombineDepth = 8 + }; + }; +}; // namespace llvm + /// Fully generic combining of x86 shuffle instructions. /// /// This should be the last combine run over the x86 shuffle instructions. 
Once @@ -35736,8 +35765,8 @@ static SDValue combineX86ShufflesRecursively( ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root, ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth, - bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { + unsigned MaxDepth, bool HasVariableMask, bool AllowVariableMask, + SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert(RootMask.size() > 0 && (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) && "Illegal shuffle root mask"); @@ -35747,8 +35776,7 @@ // Bound the depth of our recursive combine because this is ultimately // quadratic in nature. - const unsigned MaxRecursionDepth = 8; - if (Depth >= MaxDepth) return SDValue(); // Directly rip through bitcasts to find the underlying operand. @@ -35947,7 +35975,7 @@ // shuffles to avoid constant pool bloat. // Don't recurse if we already have more source ops than we can combine in // the remaining recursion depth. - if (Ops.size() < (MaxRecursionDepth - Depth)) { + if (Ops.size() < (MaxDepth - Depth)) { for (int i = 0, e = Ops.size(); i < e; ++i) { // For empty roots, we need to resolve zeroable elements before combining // them with other shuffles. @@ -35959,7 +35987,7 @@ SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) AllowVar = AllowVariableMask; if (SDValue Res = combineX86ShufflesRecursively( - Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, + Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth, HasVariableMask, AllowVar, DAG, Subtarget)) return Res; } @@ -36011,6 +36039,7 @@ static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { return combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 0, + X86::MaxShuffleCombineDepth, /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget); } @@ -36316,6 +36345,7 @@ DemandedMask[i] = i; if (SDValue Res = combineX86ShufflesRecursively( {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0, + X86::MaxShuffleCombineDepth, /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget)) return DAG.getNode(X86ISD::VBROADCAST, DL, VT, DAG.getBitcast(SrcVT, Res)); @@ -37799,16 +37829,22 @@ // If we don't demand all elements, then attempt to combine to a simpler // shuffle. - // TODO: Handle other depths, but first we need to handle the fact that - // it might combine to the same shuffle. - if (!DemandedElts.isAllOnesValue() && Depth == 0) { + // We need to convert the depth to something combineX86ShufflesRecursively + // can handle - so pretend its Depth == 0 again, and reduce the max depth + // to match. This prevents combineX86ShuffleChain from returning a + // combined shuffle that's the same as the original root, causing an + // infinite loop.
+ if (!DemandedElts.isAllOnesValue()) { + assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range"); + SmallVector DemandedMask(NumElts, SM_SentinelUndef); for (int i = 0; i != NumElts; ++i) if (DemandedElts[i]) DemandedMask[i] = i; SDValue NewShuffle = combineX86ShufflesRecursively( - {Op}, 0, Op, DemandedMask, {}, Depth, /*HasVarMask*/ false, + {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth, + /*HasVarMask*/ false, /*AllowVarMask*/ true, TLO.DAG, Subtarget); if (NewShuffle) return TLO.CombineTo(Op, NewShuffle); @@ -43406,6 +43442,7 @@ if (SDValue Shuffle = combineX86ShufflesRecursively( {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1, + X86::MaxShuffleCombineDepth, /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget)) return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle, N->getOperand(0).getOperand(1)); diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll --- a/llvm/test/CodeGen/X86/avg.ll +++ b/llvm/test/CodeGen/X86/avg.ll @@ -2746,12 +2746,12 @@ ; AVX2-NEXT: vmovq %rax, %xmm7 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-NEXT: vpslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1] +; AVX2-NEXT: vpbroadcastw %xmm8, %xmm8 ; AVX2-NEXT: vpbroadcastw %xmm9, %xmm0 ; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,6],xmm8[7] ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 ; AVX2-NEXT: vpbroadcastw %xmm9, %xmm1 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm8[3] diff --git a/llvm/test/CodeGen/X86/avx-trunc.ll b/llvm/test/CodeGen/X86/avx-trunc.ll --- a/llvm/test/CodeGen/X86/avx-trunc.ll +++ b/llvm/test/CodeGen/X86/avx-trunc.ll @@ -16,7 +16,7 @@ ; CHECK-LABEL: trunc_32_16: ; CHECK: # %bb.0: ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> ; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll --- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll @@ -6637,7 +6637,7 @@ ; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] ; X64-NEXT: vpmuludq %xmm0, %xmm2, %xmm2 ; X64-NEXT: vpsrlq $32, %xmm0, 
%xmm3 ; X64-NEXT: vpmuludq %xmm3, %xmm1, %xmm3 @@ -6845,7 +6845,7 @@ ; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] ; X64-NEXT: vpmuludq %xmm0, %xmm2, %xmm2 ; X64-NEXT: vpsrlq $32, %xmm0, %xmm3 ; X64-NEXT: vpmuludq %xmm3, %xmm1, %xmm3 diff --git a/llvm/test/CodeGen/X86/bitcast-and-setcc-128.ll b/llvm/test/CodeGen/X86/bitcast-and-setcc-128.ll --- a/llvm/test/CodeGen/X86/bitcast-and-setcc-128.ll +++ b/llvm/test/CodeGen/X86/bitcast-and-setcc-128.ll @@ -182,11 +182,11 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpgtb %xmm1, %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-NEXT: pcmpgtb %xmm3, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: movmskpd %xmm1, %eax @@ -249,10 +249,10 @@ ; SSE2-SSSE3-LABEL: v2i16: ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-SSSE3-NEXT: pcmpgtw %xmm3, %xmm2 -; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,1,4,5,6,7] ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1 ; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax diff --git a/llvm/test/CodeGen/X86/bitcast-setcc-128.ll b/llvm/test/CodeGen/X86/bitcast-setcc-128.ll --- a/llvm/test/CodeGen/X86/bitcast-setcc-128.ll +++ b/llvm/test/CodeGen/X86/bitcast-setcc-128.ll @@ -148,7 +148,7 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpgtb %xmm1, %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-NEXT: movmskpd %xmm0, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax @@ -195,7 +195,7 @@ ; SSE2-SSSE3-LABEL: v2i16: ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax ; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax diff --git a/llvm/test/CodeGen/X86/buildvec-extract.ll b/llvm/test/CodeGen/X86/buildvec-extract.ll --- a/llvm/test/CodeGen/X86/buildvec-extract.ll +++ b/llvm/test/CodeGen/X86/buildvec-extract.ll @@ -407,13 +407,13 @@ define <2 x i64> @extract1_i16_zext_insert0_i64_undef(<8 x i16> %x) { ; SSE-LABEL: extract1_i16_zext_insert0_i64_undef: ; SSE: # %bb.0: -; SSE-NEXT: pslldq {{.*#+}} xmm0 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: retq ; ; AVX-LABEL: extract1_i16_zext_insert0_i64_undef: ; AVX: # %bb.0: -; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: retq %e = extractelement <8 x i16> %x, i32 1 @@ -650,8 +650,7 @@ define <2 x i64> @extract3_i16_zext_insert1_i64_undef(<8 x i16> %x) { ; SSE2-LABEL: extract3_i16_zext_insert1_i64_undef: ; SSE2: # %bb.0: -; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psrlq $48, %xmm0 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] ; SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/buildvec-insertvec.ll b/llvm/test/CodeGen/X86/buildvec-insertvec.ll --- a/llvm/test/CodeGen/X86/buildvec-insertvec.ll +++ b/llvm/test/CodeGen/X86/buildvec-insertvec.ll @@ -50,12 +50,11 @@ define <4 x float> @test_negative_zero_1(<4 x float> %A) { ; SSE2-LABEL: test_negative_zero_1: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/combine-fcopysign.ll b/llvm/test/CodeGen/X86/combine-fcopysign.ll --- a/llvm/test/CodeGen/X86/combine-fcopysign.ll +++ b/llvm/test/CodeGen/X86/combine-fcopysign.ll @@ -266,7 +266,7 @@ ; SSE-NEXT: andps {{.*}}(%rip), %xmm1 ; SSE-NEXT: orps %xmm5, %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1],xmm6[0],xmm3[3] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm6[0] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: andps %xmm4, %xmm0 ; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] diff --git a/llvm/test/CodeGen/X86/combine-shl.ll b/llvm/test/CodeGen/X86/combine-shl.ll --- a/llvm/test/CodeGen/X86/combine-shl.ll +++ b/llvm/test/CodeGen/X86/combine-shl.ll @@ -414,10 +414,10 @@ ; SSE2-LABEL: combine_vec_shl_ge_ashr_extact1: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $5, %xmm1 +; SSE2-NEXT: psrad $3, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad $3, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,3] +; SSE2-NEXT: psrad $5, %xmm2 +; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrad $8, %xmm1 ; SSE2-NEXT: psrad $4, %xmm0 @@ -476,10 +476,10 @@ ; SSE2-LABEL: combine_vec_shl_lt_ashr_extact1: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; 
SSE2-NEXT: psrad $7, %xmm1 +; SSE2-NEXT: psrad $5, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad $5, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,3] +; SSE2-NEXT: psrad $7, %xmm2 +; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrad $8, %xmm1 ; SSE2-NEXT: psrad $6, %xmm0 @@ -541,10 +541,10 @@ ; SSE2-LABEL: combine_vec_shl_gt_lshr1: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $5, %xmm1 +; SSE2-NEXT: psrld $3, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrld $3, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,3] +; SSE2-NEXT: psrld $5, %xmm2 +; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrld $8, %xmm1 ; SSE2-NEXT: psrld $4, %xmm0 @@ -606,10 +606,10 @@ ; SSE2-LABEL: combine_vec_shl_le_lshr1: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $7, %xmm1 +; SSE2-NEXT: psrld $5, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrld $5, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,3] +; SSE2-NEXT: psrld $7, %xmm2 +; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrld $8, %xmm1 ; SSE2-NEXT: psrld $6, %xmm0 diff --git a/llvm/test/CodeGen/X86/haddsub-undef.ll b/llvm/test/CodeGen/X86/haddsub-undef.ll --- a/llvm/test/CodeGen/X86/haddsub-undef.ll +++ b/llvm/test/CodeGen/X86/haddsub-undef.ll @@ -1051,7 +1051,7 @@ ; SSE-SLOW-NEXT: addps %xmm1, %xmm2 ; SSE-SLOW-NEXT: movsldup {{.*#+}} xmm3 = xmm1[0,0,2,2] ; SSE-SLOW-NEXT: addps %xmm1, %xmm3 -; SSE-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[0,0] +; SSE-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm2[0,3] ; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0] ; SSE-SLOW-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll --- a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll +++ b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll @@ -85,9 +85,9 @@ define <4 x i32> @elt3_v4i32(i32 %x) { ; X32SSE2-LABEL: elt3_v4i32: ; X32SSE2: # %bb.0: -; X32SSE2-NEXT: movaps {{.*#+}} xmm0 = <42,1,2,u> ; X32SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; X32SSE2-NEXT: movaps {{.*#+}} xmm0 = <42,1,2,u> +; X32SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; X32SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; X32SSE2-NEXT: retl ; @@ -95,7 +95,7 @@ ; X64SSE2: # %bb.0: ; X64SSE2-NEXT: movd %edi, %xmm1 ; X64SSE2-NEXT: movaps {{.*#+}} xmm0 = <42,1,2,u> -; X64SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; X64SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; X64SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; X64SSE2-NEXT: retq ; @@ -166,14 +166,14 @@ ; X32SSE2: # %bb.0: ; X32SSE2-NEXT: movaps {{.*#+}} xmm1 = <4.2E+1,u,2.0E+0,3.0E+0> ; X32SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; X32SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X32SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; X32SSE2-NEXT: retl ; ; X64SSE2-LABEL: elt1_v4f32: ; X64SSE2: # %bb.0: ; X64SSE2-NEXT: movaps {{.*#+}} xmm1 = <4.2E+1,u,2.0E+0,3.0E+0> -; X64SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; X64SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X64SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; X64SSE2-NEXT: retq ; @@ -237,9 +237,9 @@ define 
<8 x i32> @elt7_v8i32(i32 %x) { ; X32SSE2-LABEL: elt7_v8i32: ; X32SSE2: # %bb.0: -; X32SSE2-NEXT: movaps {{.*#+}} xmm1 = <4,5,6,u> ; X32SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0] +; X32SSE2-NEXT: movaps {{.*#+}} xmm1 = <4,5,6,u> +; X32SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; X32SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; X32SSE2-NEXT: movaps {{.*#+}} xmm0 = [42,1,2,3] ; X32SSE2-NEXT: retl @@ -248,7 +248,7 @@ ; X64SSE2: # %bb.0: ; X64SSE2-NEXT: movd %edi, %xmm0 ; X64SSE2-NEXT: movaps {{.*#+}} xmm1 = <4,5,6,u> -; X64SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0] +; X64SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; X64SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; X64SSE2-NEXT: movaps {{.*#+}} xmm0 = [42,1,2,3] ; X64SSE2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/insertelement-shuffle.ll b/llvm/test/CodeGen/X86/insertelement-shuffle.ll --- a/llvm/test/CodeGen/X86/insertelement-shuffle.ll +++ b/llvm/test/CodeGen/X86/insertelement-shuffle.ll @@ -9,7 +9,7 @@ ; X86: # %bb.0: ; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X86-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; X86-NEXT: vpbroadcastd %xmm1, %xmm1 ; X86-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7] ; X86-NEXT: retl ; @@ -17,7 +17,7 @@ ; X64: # %bb.0: ; X64-NEXT: vmovd %edi, %xmm1 ; X64-NEXT: vpinsrw $1, %esi, %xmm1, %xmm1 -; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; X64-NEXT: vpbroadcastd %xmm1, %xmm1 ; X64-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7] ; X64-NEXT: retq %ins1 = insertelement <2 x i16> undef, i16 %x0, i32 0 diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll --- a/llvm/test/CodeGen/X86/known-signbits-vector.ll +++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll @@ -321,20 +321,12 @@ ret <2 x double> %3 } -; TODO: Fix vpshufd+vpsrlq -> vpshufd/vpermilps define <2 x double> @signbits_ashr_concat_ashr_extract_sitofp(<2 x i64> %a0, <4 x i64> %a1) nounwind { -; X86-LABEL: signbits_ashr_concat_ashr_extract_sitofp: -; X86: # %bb.0: -; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3] -; X86-NEXT: vcvtdq2pd %xmm0, %xmm0 -; X86-NEXT: retl -; -; X64-LABEL: signbits_ashr_concat_ashr_extract_sitofp: -; X64: # %bb.0: -; X64-NEXT: vpsrlq $32, %xmm0, %xmm0 -; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-NEXT: vcvtdq2pd %xmm0, %xmm0 -; X64-NEXT: retq +; CHECK-LABEL: signbits_ashr_concat_ashr_extract_sitofp: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; CHECK-NEXT: vcvtdq2pd %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} %1 = ashr <2 x i64> %a0, %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> %3 = shufflevector <4 x i64> %a1, <4 x i64> %2, <4 x i32> diff --git a/llvm/test/CodeGen/X86/load-partial.ll b/llvm/test/CodeGen/X86/load-partial.ll --- a/llvm/test/CodeGen/X86/load-partial.ll +++ b/llvm/test/CodeGen/X86/load-partial.ll @@ -346,57 +346,40 @@ } define void @PR43227(i32* %explicit_0, <8 x i32>* %explicit_1) { -; SSE2-LABEL: PR43227: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE2-NEXT: movaps 
%xmm1, 672(%rsi) -; SSE2-NEXT: movaps %xmm2, 688(%rsi) -; SSE2-NEXT: retq +; SSE-LABEL: PR43227: +; SSE: # %bb.0: +; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: psrlq $32, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm1, 672(%rsi) +; SSE-NEXT: movdqa %xmm0, 688(%rsi) +; SSE-NEXT: retq ; -; SSSE3-LABEL: PR43227: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSSE3-NEXT: xorps %xmm1, %xmm1 -; SSSE3-NEXT: xorps %xmm2, %xmm2 -; SSSE3-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSSE3-NEXT: movaps %xmm1, 672(%rsi) -; SSSE3-NEXT: movaps %xmm2, 688(%rsi) -; SSSE3-NEXT: retq +; AVX1-LABEL: PR43227: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, 672(%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq ; -; SSE41-LABEL: PR43227: -; SSE41: # %bb.0: -; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE41-NEXT: movdqa %xmm1, 672(%rsi) -; SSE41-NEXT: movdqa %xmm0, 688(%rsi) -; SSE41-NEXT: retq -; -; AVX-LABEL: PR43227: -; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: vmovaps %ymm0, 672(%rsi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX2-LABEL: PR43227: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 672(%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq %1 = getelementptr i32, i32* %explicit_0, i64 63 %2 = bitcast i32* %1 to <3 x i32>* %3 = load <3 x i32>, <3 x i32>* %2, align 1 diff --git a/llvm/test/CodeGen/X86/masked_expandload.ll b/llvm/test/CodeGen/X86/masked_expandload.ll --- a/llvm/test/CodeGen/X86/masked_expandload.ll +++ b/llvm/test/CodeGen/X86/masked_expandload.ll @@ -1136,7 +1136,7 @@ ; SSE2-NEXT: je LBB4_4 ; SSE2-NEXT: LBB4_3: ## %cond.load1 ; SSE2-NEXT: movss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -1222,11 +1222,9 @@ define <4 x float> @expandload_v4f32_const(float* %base, <4 x float> %src0) { ; SSE2-LABEL: 
expandload_v4f32_const: ; SSE2: ## %bb.0: -; SSE2-NEXT: movss 8(%rdi), %xmm2 ## xmm2 = mem[0],zero,zero,zero ; SSE2-NEXT: movsd (%rdi), %xmm1 ## xmm1 = mem[0],zero -; SSE2-NEXT: movaps %xmm1, %xmm3 -; SSE2-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm3[2,0] +; SSE2-NEXT: movss 8(%rdi), %xmm2 ## xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm2[0,3] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -1275,19 +1273,15 @@ define <16 x float> @expandload_v16f32_const(float* %base, <16 x float> %src0) { ; SSE2-LABEL: expandload_v16f32_const: ; SSE2: ## %bb.0: -; SSE2-NEXT: movss 52(%rdi), %xmm0 ## xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movsd 44(%rdi), %xmm4 ## xmm4 = mem[0],zero -; SSE2-NEXT: movaps %xmm4, %xmm6 -; SSE2-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] -; SSE2-NEXT: movss 40(%rdi), %xmm0 ## xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movsd 32(%rdi), %xmm5 ## xmm5 = mem[0],zero -; SSE2-NEXT: movaps %xmm5, %xmm7 -; SSE2-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0] ; SSE2-NEXT: movups (%rdi), %xmm0 ; SSE2-NEXT: movups 16(%rdi), %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm7[2,0] +; SSE2-NEXT: movsd 44(%rdi), %xmm4 ## xmm4 = mem[0],zero +; SSE2-NEXT: movss 52(%rdi), %xmm6 ## xmm6 = mem[0],zero,zero,zero +; SSE2-NEXT: movsd 32(%rdi), %xmm5 ## xmm5 = mem[0],zero +; SSE2-NEXT: movss 40(%rdi), %xmm7 ## xmm7 = mem[0],zero,zero,zero +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm7[0,3] ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm6[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm6[0,3] ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] ; SSE2-NEXT: movaps %xmm5, %xmm2 ; SSE2-NEXT: movaps %xmm4, %xmm3 @@ -1520,7 +1514,7 @@ ; SSE2-NEXT: je LBB8_64 ; SSE2-NEXT: LBB8_63: ## %cond.load121 ; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm7[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm7[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,0] ; SSE2-NEXT: LBB8_64: ## %else122 ; SSE2-NEXT: movaps %xmm0, (%rax) @@ -1540,7 +1534,7 @@ ; SSE2-NEXT: je LBB8_4 ; SSE2-NEXT: LBB8_3: ## %cond.load1 ; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm0[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] ; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm0[2,3] ; SSE2-NEXT: addq $4, %rsi ; SSE2-NEXT: movaps %xmm8, %xmm0 @@ -1555,7 +1549,7 @@ ; SSE2-NEXT: je LBB8_8 ; SSE2-NEXT: LBB8_7: ## %cond.load9 ; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm0[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,0] ; SSE2-NEXT: addq $4, %rsi ; SSE2-NEXT: testb $16, %cl @@ -1568,7 +1562,7 @@ ; SSE2-NEXT: je LBB8_12 ; SSE2-NEXT: LBB8_11: ## %cond.load17 ; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm1[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0] ; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm1[2,3] ; SSE2-NEXT: addq $4, %rsi ; SSE2-NEXT: movaps %xmm8, %xmm1 @@ -1583,7 +1577,7 @@ ; SSE2-NEXT: je LBB8_16 ; SSE2-NEXT: LBB8_15: ## %cond.load25 ; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm8 = 
xmm8[0,0],xmm1[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,0] ; SSE2-NEXT: addq $4, %rsi ; SSE2-NEXT: testl $256, %ecx ## imm = 0x100 @@ -1596,7 +1590,7 @@ ; SSE2-NEXT: je LBB8_20 ; SSE2-NEXT: LBB8_19: ## %cond.load33 ; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm2[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0] ; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm2[2,3] ; SSE2-NEXT: addq $4, %rsi ; SSE2-NEXT: movaps %xmm8, %xmm2 @@ -1611,7 +1605,7 @@ ; SSE2-NEXT: je LBB8_24 ; SSE2-NEXT: LBB8_23: ## %cond.load41 ; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm2[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm2[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,0] ; SSE2-NEXT: addq $4, %rsi ; SSE2-NEXT: testl $4096, %ecx ## imm = 0x1000 @@ -1624,7 +1618,7 @@ ; SSE2-NEXT: je LBB8_28 ; SSE2-NEXT: LBB8_27: ## %cond.load49 ; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm3[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm3[0] ; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm3[2,3] ; SSE2-NEXT: addq $4, %rsi ; SSE2-NEXT: movaps %xmm8, %xmm3 @@ -1639,7 +1633,7 @@ ; SSE2-NEXT: je LBB8_32 ; SSE2-NEXT: LBB8_31: ## %cond.load57 ; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm3[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm3[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm8[2,0] ; SSE2-NEXT: addq $4, %rsi ; SSE2-NEXT: testl $65536, %ecx ## imm = 0x10000 @@ -1652,7 +1646,7 @@ ; SSE2-NEXT: je LBB8_36 ; SSE2-NEXT: LBB8_35: ## %cond.load65 ; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm4[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm4[0] ; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,3] ; SSE2-NEXT: addq $4, %rsi ; SSE2-NEXT: movaps %xmm8, %xmm4 @@ -1667,7 +1661,7 @@ ; SSE2-NEXT: je LBB8_40 ; SSE2-NEXT: LBB8_39: ## %cond.load73 ; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm4[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm4[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,0] ; SSE2-NEXT: addq $4, %rsi ; SSE2-NEXT: testl $1048576, %ecx ## imm = 0x100000 @@ -1680,7 +1674,7 @@ ; SSE2-NEXT: je LBB8_44 ; SSE2-NEXT: LBB8_43: ## %cond.load81 ; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm5[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm5[0] ; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm5[2,3] ; SSE2-NEXT: addq $4, %rsi ; SSE2-NEXT: movaps %xmm8, %xmm5 @@ -1695,7 +1689,7 @@ ; SSE2-NEXT: je LBB8_48 ; SSE2-NEXT: LBB8_47: ## %cond.load89 ; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm5[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,0] ; SSE2-NEXT: addq $4, %rsi ; SSE2-NEXT: testl $16777216, %ecx ## imm = 0x1000000 @@ -1708,7 +1702,7 @@ ; SSE2-NEXT: je LBB8_52 ; SSE2-NEXT: LBB8_51: ## %cond.load97 ; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm6[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm6[0] 
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm6[2,3] ; SSE2-NEXT: addq $4, %rsi ; SSE2-NEXT: movaps %xmm8, %xmm6 @@ -1723,7 +1717,7 @@ ; SSE2-NEXT: je LBB8_56 ; SSE2-NEXT: LBB8_55: ## %cond.load105 ; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm6[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm6[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,0] ; SSE2-NEXT: addq $4, %rsi ; SSE2-NEXT: testl $268435456, %ecx ## imm = 0x10000000 @@ -1736,7 +1730,7 @@ ; SSE2-NEXT: je LBB8_60 ; SSE2-NEXT: LBB8_59: ## %cond.load113 ; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm7[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm7[0] ; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm7[2,3] ; SSE2-NEXT: addq $4, %rsi ; SSE2-NEXT: movaps %xmm8, %xmm7 @@ -2814,7 +2808,7 @@ ; SSE2-NEXT: je LBB10_4 ; SSE2-NEXT: LBB10_3: ## %cond.load1 ; SSE2-NEXT: movss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] ; SSE2-NEXT: addq $4, %rdi ; SSE2-NEXT: movaps %xmm1, %xmm0 @@ -2829,7 +2823,7 @@ ; SSE2-NEXT: je LBB10_8 ; SSE2-NEXT: LBB10_7: ## %cond.load9 ; SSE2-NEXT: movss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll --- a/llvm/test/CodeGen/X86/masked_load.ll +++ b/llvm/test/CodeGen/X86/masked_load.ll @@ -742,7 +742,7 @@ ; SSE2-NEXT: je LBB7_4 ; SSE2-NEXT: LBB7_3: ## %cond.load1 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: movaps %xmm1, %xmm0 @@ -834,7 +834,7 @@ ; SSE2-NEXT: je LBB8_4 ; SSE2-NEXT: LBB8_3: ## %cond.load1 ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -927,7 +927,7 @@ ; SSE2-NEXT: je LBB9_4 ; SSE2-NEXT: LBB9_3: ## %cond.load1 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: testb $4, %al @@ -940,7 +940,7 @@ ; SSE2-NEXT: je LBB9_8 ; SSE2-NEXT: LBB9_7: ## %cond.load7 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -1051,8 +1051,8 @@ ; SSE2-NEXT: testb $2, %al ; SSE2-NEXT: je LBB10_4 ; SSE2-NEXT: LBB10_3: ## %cond.load1 -; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[0,0] +; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = 
xmm2[0],xmm0[0] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: testb $4, %al @@ -1065,7 +1065,7 @@ ; SSE2-NEXT: je LBB10_8 ; SSE2-NEXT: LBB10_7: ## %cond.load7 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] ; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: je LBB10_10 @@ -1076,7 +1076,7 @@ ; SSE2-NEXT: je LBB10_12 ; SSE2-NEXT: LBB10_11: ## %cond.load13 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3] ; SSE2-NEXT: movaps %xmm2, %xmm1 ; SSE2-NEXT: testb $64, %al @@ -1089,7 +1089,7 @@ ; SSE2-NEXT: je LBB10_16 ; SSE2-NEXT: LBB10_15: ## %cond.load19 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] ; SSE2-NEXT: retq ; @@ -1237,7 +1237,7 @@ ; SSE2-NEXT: je LBB11_16 ; SSE2-NEXT: LBB11_15: ## %cond.load19 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm3[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] ; SSE2-NEXT: LBB11_16: ## %else20 ; SSE2-NEXT: movaps %xmm2, %xmm0 @@ -1250,7 +1250,7 @@ ; SSE2-NEXT: je LBB11_4 ; SSE2-NEXT: LBB11_3: ## %cond.load1 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3] ; SSE2-NEXT: movaps %xmm0, %xmm2 ; SSE2-NEXT: testb $4, %al @@ -1263,7 +1263,7 @@ ; SSE2-NEXT: je LBB11_8 ; SSE2-NEXT: LBB11_7: ## %cond.load7 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] ; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: je LBB11_10 @@ -1274,7 +1274,7 @@ ; SSE2-NEXT: je LBB11_12 ; SSE2-NEXT: LBB11_11: ## %cond.load13 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm3[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[2,3] ; SSE2-NEXT: movaps %xmm0, %xmm3 ; SSE2-NEXT: testb $64, %al @@ -2066,7 +2066,7 @@ ; SSE2-NEXT: je LBB17_4 ; SSE2-NEXT: LBB17_3: ## %cond.load1 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: movaps %xmm1, %xmm0 @@ -2172,7 +2172,7 @@ ; SSE2-NEXT: je LBB18_4 ; SSE2-NEXT: LBB18_3: ## %cond.load1 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: testb $4, %al @@ -2185,7 +2185,7 @@ ; SSE2-NEXT: je LBB18_8 ; SSE2-NEXT: LBB18_7: ## %cond.load7 ; SSE2-NEXT: movss {{.*#+}} xmm0 = 
mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -2295,7 +2295,7 @@ ; SSE2-NEXT: je LBB19_16 ; SSE2-NEXT: LBB19_15: ## %cond.load19 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] ; SSE2-NEXT: LBB19_16: ## %else20 ; SSE2-NEXT: movaps %xmm1, %xmm0 @@ -2308,7 +2308,7 @@ ; SSE2-NEXT: je LBB19_4 ; SSE2-NEXT: LBB19_3: ## %cond.load1 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: testb $4, %al @@ -2321,7 +2321,7 @@ ; SSE2-NEXT: je LBB19_8 ; SSE2-NEXT: LBB19_7: ## %cond.load7 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: je LBB19_10 @@ -2332,7 +2332,7 @@ ; SSE2-NEXT: je LBB19_12 ; SSE2-NEXT: LBB19_11: ## %cond.load13 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3] ; SSE2-NEXT: movaps %xmm0, %xmm2 ; SSE2-NEXT: testb $64, %al @@ -2495,8 +2495,8 @@ ; SSE2-NEXT: testb $2, %al ; SSE2-NEXT: je LBB20_4 ; SSE2-NEXT: LBB20_3: ## %cond.load1 -; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[0,0] +; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: testb $4, %al @@ -2509,7 +2509,7 @@ ; SSE2-NEXT: je LBB20_8 ; SSE2-NEXT: LBB20_7: ## %cond.load7 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] ; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: je LBB20_10 @@ -2520,7 +2520,7 @@ ; SSE2-NEXT: je LBB20_12 ; SSE2-NEXT: LBB20_11: ## %cond.load13 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3] ; SSE2-NEXT: movaps %xmm2, %xmm1 ; SSE2-NEXT: testb $64, %al @@ -2533,7 +2533,7 @@ ; SSE2-NEXT: je LBB20_16 ; SSE2-NEXT: LBB20_15: ## %cond.load19 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] ; SSE2-NEXT: retq ; @@ -6120,17 +6120,15 @@ ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSE2-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; 
SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm1[2,0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0] ; SSE2-NEXT: retq ; ; SSE42-LABEL: mload_constmask_v4f32: ; SSE42: ## %bb.0: ; SSE42-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE42-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; SSE42-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] ; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; SSE42-NEXT: retq @@ -6216,13 +6214,11 @@ ; SSE2-LABEL: mload_constmask_v4i32: ; SSE2: ## %bb.0: ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] -; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps %xmm1, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[0,2] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] +; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,0] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -6301,7 +6297,7 @@ ; SSE2: ## %bb.0: ; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm3[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[3,0] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[0,2] @@ -6311,7 +6307,7 @@ ; SSE42-LABEL: mload_constmask_v8f32: ; SSE42: ## %bb.0: ; SSE42-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE42-NEXT: blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] +; SSE42-NEXT: blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] ; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] ; SSE42-NEXT: retq @@ -6443,12 +6439,12 @@ ; SSE2: ## %bb.0: ; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm3[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[3,0] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[0,2] ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll --- a/llvm/test/CodeGen/X86/masked_store_trunc.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll @@ -3472,7 +3472,7 @@ ; AVX1-LABEL: truncstore_v8i32_v8i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = 
xmm0[0],xmm2[0] diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll --- a/llvm/test/CodeGen/X86/oddshuffles.ll +++ b/llvm/test/CodeGen/X86/oddshuffles.ll @@ -512,16 +512,16 @@ ; SSE2-LABEL: v12i32: ; SSE2: # %bb.0: ; SSE2-NEXT: movaps %xmm2, %xmm3 -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[1,0] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[1,3] ; SSE2-NEXT: movaps %xmm0, %xmm4 ; SSE2-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[0,2] ; SSE2-NEXT: movaps %xmm0, %xmm3 -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,1] +; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE2-NEXT: movaps %xmm2, %xmm5 ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,0] ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[0,2] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,2],xmm2[3,2] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[3,3] ; SSE2-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[0,2] ; SSE2-NEXT: movaps %xmm2, 32(%rdi) @@ -908,7 +908,7 @@ ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,7,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,7,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[0,1,2,1] ; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,5] ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm4[2,0] @@ -1186,41 +1186,39 @@ ; SSE2-LABEL: interleave_24i32_out: ; SSE2: # %bb.0: ; SSE2-NEXT: movups 80(%rdi), %xmm8 -; SSE2-NEXT: movups 64(%rdi), %xmm4 -; SSE2-NEXT: movdqu (%rdi), %xmm0 -; SSE2-NEXT: movups 16(%rdi), %xmm6 +; SSE2-NEXT: movups 64(%rdi), %xmm3 +; SSE2-NEXT: movdqu (%rdi), %xmm1 +; SSE2-NEXT: movups 16(%rdi), %xmm5 ; SSE2-NEXT: movups 32(%rdi), %xmm10 -; SSE2-NEXT: movups 48(%rdi), %xmm12 -; SSE2-NEXT: movdqa %xmm0, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm6[0,0] -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[3,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,1,1] -; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm10[1,0] -; SSE2-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,3],xmm6[0,2] -; SSE2-NEXT: movaps %xmm12, %xmm6 -; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm4[0,0] -; SSE2-NEXT: movaps %xmm6, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[3,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,1,1] -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm8[1,0] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm12[2,3,2,3] -; SSE2-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,3],xmm4[0,2] -; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm8[0,3] -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,2],xmm2[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm8[2,0] -; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm10[0,3] -; SSE2-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,2],xmm1[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm10[2,0] -; SSE2-NEXT: movups %xmm12, 16(%rsi) +; SSE2-NEXT: movdqu 48(%rdi), %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm11 +; SSE2-NEXT: movaps %xmm10, %xmm7 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,1],xmm5[3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; 
SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm5[0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,1,1] +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm10[1,1] +; SSE2-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,3],xmm5[0,2] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: movaps %xmm8, %xmm4 +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,1],xmm3[3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,2,3] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm3[0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm3[1,1,1,1] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm8[1,1] +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm3[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[2,0] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm10[0,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1] +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm8[0,3] +; SSE2-NEXT: movups %xmm5, 16(%rsi) ; SSE2-NEXT: movups %xmm11, (%rsi) -; SSE2-NEXT: movups %xmm6, 16(%rdx) -; SSE2-NEXT: movups %xmm0, (%rdx) -; SSE2-NEXT: movups %xmm5, 16(%rcx) -; SSE2-NEXT: movups %xmm7, (%rcx) +; SSE2-NEXT: movups %xmm2, 16(%rdx) +; SSE2-NEXT: movups %xmm1, (%rdx) +; SSE2-NEXT: movups %xmm6, 16(%rcx) +; SSE2-NEXT: movups %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: interleave_24i32_out: @@ -1420,29 +1418,29 @@ ; SSE2-NEXT: movups (%rcx), %xmm3 ; SSE2-NEXT: movups 16(%rcx), %xmm6 ; SSE2-NEXT: movaps %xmm3, %xmm7 -; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,0],xmm1[1,0] +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[1,3] ; SSE2-NEXT: movaps %xmm1, %xmm9 ; SSE2-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] ; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm7[0,2] ; SSE2-NEXT: movaps %xmm5, %xmm7 -; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,2],xmm6[3,2] +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,3],xmm6[3,3] ; SSE2-NEXT: movaps %xmm6, %xmm4 ; SSE2-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm7[0,2] ; SSE2-NEXT: movaps %xmm0, %xmm7 -; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm5[2,1] +; SSE2-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm5[1] ; SSE2-NEXT: movaps %xmm6, %xmm2 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm5[1,0] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm7[0,2] -; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,0],xmm0[1,0] +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[1,3] ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,2] ; SSE2-NEXT: movaps %xmm8, %xmm5 -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,2],xmm3[3,2] +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm3[3,3] ; SSE2-NEXT: movaps %xmm3, %xmm6 ; SSE2-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] ; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm5[0,2] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm8[2,1] +; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm8[1] ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm8[1,0] ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[0,2] ; SSE2-NEXT: movups %xmm3, 16(%rdi) @@ -1498,20 +1496,20 @@ ; AVX1-LABEL: interleave_24i32_in: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovups (%rdx), %xmm0 -; AVX1-NEXT: vmovups 16(%rdx), %xmm1 -; AVX1-NEXT: vmovups (%rsi), %xmm2 -; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,0],xmm0[2,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[1,1],xmm3[0,2] -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = 
xmm0[0,0],xmm2[0,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,1] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastsd (%rcx), %ymm2 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] -; AVX1-NEXT: vmovups 16(%rcx), %xmm2 -; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm1[3,0],xmm2[3,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,1],xmm3[0,2] -; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[1,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,2] +; AVX1-NEXT: vmovups (%rsi), %xmm1 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1] +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,1],xmm2[0,2] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastsd (%rcx), %ymm1 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX1-NEXT: vmovups 16(%rcx), %xmm1 +; AVX1-NEXT: vmovups 16(%rdx), %xmm2 +; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm2[3,0],xmm1[3,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm1[2,1],xmm3[0,2] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[1,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,2] ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-NEXT: vbroadcastsd 24(%rsi), %ymm2 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] @@ -1589,20 +1587,20 @@ ; XOP-NEXT: vmovups (%rcx), %ymm1 ; XOP-NEXT: vpermil2ps {{.*#+}} ymm0 = ymm1[2],ymm0[3],ymm1[u,3],ymm0[4],ymm1[u,4],ymm0[5] ; XOP-NEXT: vmovups (%rdx), %xmm1 -; XOP-NEXT: vmovups 16(%rdx), %xmm2 -; XOP-NEXT: vmovups (%rsi), %xmm3 -; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,0],xmm1[2,0] -; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm1[1,1],xmm4[0,2] -; XOP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm3[0,0] -; XOP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[2,1] -; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; XOP-NEXT: vbroadcastsd (%rcx), %ymm3 -; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] -; XOP-NEXT: vmovups 16(%rcx), %xmm3 -; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm2[3,0],xmm3[3,0] -; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,1],xmm4[0,2] -; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[1,0] -; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,0],xmm2[2,2] +; XOP-NEXT: vmovups (%rsi), %xmm2 +; XOP-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm2[1],xmm1[1] +; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm1[1,1],xmm3[0,2] +; XOP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; XOP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,1] +; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; XOP-NEXT: vbroadcastsd (%rcx), %ymm2 +; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; XOP-NEXT: vmovups 16(%rcx), %xmm2 +; XOP-NEXT: vmovups 16(%rdx), %xmm3 +; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm3[3,0],xmm2[3,0] +; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm2[2,1],xmm4[0,2] +; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm3[1,0] +; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[2,2] ; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; XOP-NEXT: vbroadcastsd 24(%rsi), %ymm3 ; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] diff --git a/llvm/test/CodeGen/X86/oddsubvector.ll b/llvm/test/CodeGen/X86/oddsubvector.ll --- a/llvm/test/CodeGen/X86/oddsubvector.ll +++ b/llvm/test/CodeGen/X86/oddsubvector.ll @@ -23,33 +23,19 @@ ; SSE-NEXT: movd %xmm1, (%rdi) ; SSE-NEXT: retq ; -; AVX1-LABEL: 
insert_v7i8_v2i16_2: -; AVX1: # %bb.0: -; AVX1-NEXT: movl (%rsi), %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: movq (%rdi), %rcx -; AVX1-NEXT: vmovq %rcx, %xmm1 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-NEXT: shrq $48, %rcx -; AVX1-NEXT: movb %cl, 6(%rdi) -; AVX1-NEXT: shrl $16, %eax -; AVX1-NEXT: movw %ax, 4(%rdi) -; AVX1-NEXT: vmovd %xmm0, (%rdi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: insert_v7i8_v2i16_2: -; AVX2: # %bb.0: -; AVX2-NEXT: movl (%rsi), %eax -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: movq (%rdi), %rcx -; AVX2-NEXT: vmovq %rcx, %xmm1 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-NEXT: shrq $48, %rcx -; AVX2-NEXT: movb %cl, 6(%rdi) -; AVX2-NEXT: shrl $16, %eax -; AVX2-NEXT: movw %ax, 4(%rdi) -; AVX2-NEXT: vmovd %xmm0, (%rdi) -; AVX2-NEXT: retq +; AVX-LABEL: insert_v7i8_v2i16_2: +; AVX: # %bb.0: +; AVX-NEXT: movl (%rsi), %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: movq (%rdi), %rcx +; AVX-NEXT: vmovq %rcx, %xmm1 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX-NEXT: shrq $48, %rcx +; AVX-NEXT: movb %cl, 6(%rdi) +; AVX-NEXT: shrl $16, %eax +; AVX-NEXT: movw %ax, 4(%rdi) +; AVX-NEXT: vmovd %xmm0, (%rdi) +; AVX-NEXT: retq ; ; AVX512-LABEL: insert_v7i8_v2i16_2: ; AVX512: # %bb.0: @@ -64,20 +50,6 @@ ; AVX512-NEXT: movw %ax, 4(%rdi) ; AVX512-NEXT: vmovd %xmm0, (%rdi) ; AVX512-NEXT: retq -; -; XOP-LABEL: insert_v7i8_v2i16_2: -; XOP: # %bb.0: -; XOP-NEXT: movl (%rsi), %eax -; XOP-NEXT: vmovd %eax, %xmm0 -; XOP-NEXT: movq (%rdi), %rcx -; XOP-NEXT: vmovq %rcx, %xmm1 -; XOP-NEXT: insertq {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,1,2,3],xmm1[6,7,u,u,u,u,u,u,u,u] -; XOP-NEXT: shrq $48, %rcx -; XOP-NEXT: movb %cl, 6(%rdi) -; XOP-NEXT: shrl $16, %eax -; XOP-NEXT: movw %ax, 4(%rdi) -; XOP-NEXT: vmovd %xmm1, (%rdi) -; XOP-NEXT: retq %1 = load <2 x i16>, <2 x i16> *%a1 %2 = bitcast <2 x i16> %1 to <4 x i8> %3 = shufflevector <4 x i8> %2, <4 x i8> undef, <7 x i32> diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll --- a/llvm/test/CodeGen/X86/pmul.ll +++ b/llvm/test/CodeGen/X86/pmul.ll @@ -1236,70 +1236,67 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) { ; SSE2-LABEL: mul_v8i64_sext: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm12 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7] -; SSE2-NEXT: psrad $16, %xmm10 -; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: movdqa %xmm1, %xmm15 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] +; SSE2-NEXT: psrad $16, %xmm14 ; SSE2-NEXT: pxor %xmm13, %xmm13 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm13 -; SSE2-NEXT: movdqa %xmm10, %xmm9 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm13[2],xmm9[3],xmm13[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] +; SSE2-NEXT: pxor %xmm10, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm14, %xmm10 +; SSE2-NEXT: movdqa %xmm14, %xmm8 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm10[2],xmm8[3],xmm10[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: pxor %xmm15, %xmm15 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm15 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 ; SSE2-NEXT: movdqa %xmm0, %xmm11 -; SSE2-NEXT: 
punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm15[2],xmm11[3],xmm15[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm5[2],xmm11[3],xmm5[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] -; SSE2-NEXT: pxor %xmm14, %xmm14 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm14 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1] -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm5 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm9 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1] +; SSE2-NEXT: pxor %xmm12, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm12 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] ; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm12, %xmm7 -; SSE2-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1] -; SSE2-NEXT: movdqa %xmm15, %xmm4 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] -; SSE2-NEXT: pmuludq %xmm12, %xmm4 -; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; SSE2-NEXT: pmuludq %xmm0, %xmm7 -; SSE2-NEXT: paddq %xmm4, %xmm7 -; SSE2-NEXT: psllq $32, %xmm7 -; SSE2-NEXT: pmuludq %xmm12, %xmm0 -; SSE2-NEXT: paddq %xmm7, %xmm0 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; SSE2-NEXT: pmuludq %xmm1, %xmm15 -; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] -; SSE2-NEXT: pmuludq %xmm11, %xmm6 -; SSE2-NEXT: paddq %xmm15, %xmm6 -; SSE2-NEXT: psllq $32, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm7 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] +; SSE2-NEXT: pcmpgtd %xmm15, %xmm13 +; SSE2-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm15, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm0, %xmm4 +; SSE2-NEXT: paddq %xmm6, %xmm4 +; SSE2-NEXT: psllq $32, %xmm4 +; SSE2-NEXT: pmuludq %xmm15, %xmm0 +; SSE2-NEXT: paddq %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,1,3,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm11, %xmm5 +; SSE2-NEXT: paddq %xmm4, %xmm5 +; SSE2-NEXT: psllq $32, %xmm5 ; SSE2-NEXT: pmuludq %xmm11, %xmm1 -; SSE2-NEXT: paddq %xmm6, %xmm1 -; SSE2-NEXT: movdqa %xmm13, %xmm4 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] +; SSE2-NEXT: paddq %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,1,1,3] ; SSE2-NEXT: pmuludq %xmm2, %xmm4 -; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1] -; SSE2-NEXT: pmuludq %xmm10, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm12[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm14, %xmm5 ; SSE2-NEXT: paddq %xmm4, %xmm5 ; SSE2-NEXT: psllq $32, %xmm5 -; SSE2-NEXT: pmuludq %xmm10, %xmm2 +; SSE2-NEXT: pmuludq %xmm14, %xmm2 ; SSE2-NEXT: paddq %xmm5, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm8[2],xmm13[3],xmm8[3] -; SSE2-NEXT: pmuludq %xmm3, %xmm13 -; SSE2-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1] -; SSE2-NEXT: pmuludq %xmm9, %xmm14 -; SSE2-NEXT: 
paddq %xmm13, %xmm14 -; SSE2-NEXT: psllq $32, %xmm14 -; SSE2-NEXT: pmuludq %xmm9, %xmm3 -; SSE2-NEXT: paddq %xmm14, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,1,3,3] +; SSE2-NEXT: pmuludq %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm8, %xmm5 +; SSE2-NEXT: paddq %xmm4, %xmm5 +; SSE2-NEXT: psllq $32, %xmm5 +; SSE2-NEXT: pmuludq %xmm8, %xmm3 +; SSE2-NEXT: paddq %xmm5, %xmm3 ; SSE2-NEXT: retq ; ; SSE41-LABEL: mul_v8i64_sext: @@ -1350,8 +1347,8 @@ define <2 x i64> @pmuldq_square(<2 x i64> %x) { ; SSE2-LABEL: pmuldq_square: ; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] ; SSE2-NEXT: psllq $32, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll --- a/llvm/test/CodeGen/X86/pmulh.ll +++ b/llvm/test/CodeGen/X86/pmulh.ll @@ -393,11 +393,10 @@ ; SSE2-LABEL: mulhsw_v4i16_ashr: ; SSE2: # %bb.0: ; SSE2-NEXT: pmulhw %xmm1, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: psrad $16, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: mulhsw_v4i16_ashr: @@ -1505,34 +1504,30 @@ define <8 x i64> @mulhuw_v8i16_lshr_i64(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: mulhuw_v8i16_lshr_i64: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSE2-NEXT: movdqa %xmm4, %xmm6 -; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,1,3,3] ; SSE2-NEXT: movdqa %xmm1, %xmm7 -; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; SSE2-NEXT: pmuludq %xmm7, %xmm0 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; SSE2-NEXT: pmuludq %xmm2, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = 
xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; SSE2-NEXT: pmuludq %xmm6, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; SSE2-NEXT: pmuludq %xmm4, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,1,3,3] +; SSE2-NEXT: pmuludq %xmm3, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,1,3,3] +; SSE2-NEXT: pmuludq %xmm6, %xmm3 ; SSE2-NEXT: psrlq $16, %xmm0 -; SSE2-NEXT: psrlq $16, %xmm1 +; SSE2-NEXT: psrlq $16, %xmm4 ; SSE2-NEXT: psrlq $16, %xmm2 ; SSE2-NEXT: psrlq $16, %xmm3 +; SSE2-NEXT: movdqa %xmm4, %xmm1 ; SSE2-NEXT: retq ; ; SSE41-LABEL: mulhuw_v8i16_lshr_i64: @@ -1571,71 +1566,66 @@ define <8 x i64> @mulhsw_v8i16_lshr_i64(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: mulhsw_v8i16_lshr_i64: ; SSE2: # %bb.0: -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; SSE2-NEXT: psrad $16, %xmm12 -; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: pxor %xmm14, %xmm14 -; SSE2-NEXT: pcmpgtd %xmm12, %xmm14 -; SSE2-NEXT: movdqa %xmm12, %xmm9 -; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm14[2],xmm12[3],xmm14[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE2-NEXT: psrad $16, %xmm6 +; SSE2-NEXT: pxor %xmm13, %xmm13 +; SSE2-NEXT: pxor %xmm10, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm10 +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm10[2],xmm6[3],xmm10[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm4, %xmm11 +; SSE2-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE2-NEXT: psrad $16, %xmm7 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm4 -; SSE2-NEXT: movdqa %xmm7, %xmm11 -; SSE2-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; SSE2-NEXT: psrad $16, %xmm5 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm10 -; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE2-NEXT: pxor %xmm12, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm12 +; SSE2-NEXT: movdqa %xmm7, %xmm9 +; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm12[2],xmm7[3],xmm12[3] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: 
psrad $16, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm13 -; SSE2-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; SSE2-NEXT: pmuludq %xmm7, %xmm3 -; SSE2-NEXT: pmuludq %xmm1, %xmm7 -; SSE2-NEXT: movdqa %xmm4, %xmm6 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; SSE2-NEXT: pmuludq %xmm1, %xmm6 -; SSE2-NEXT: paddq %xmm6, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm13 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm13[2,1,3,3] +; SSE2-NEXT: pmuludq %xmm4, %xmm3 +; SSE2-NEXT: pmuludq %xmm1, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,1,3,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm2 +; SSE2-NEXT: paddq %xmm2, %xmm3 ; SSE2-NEXT: psllq $32, %xmm3 -; SSE2-NEXT: paddq %xmm7, %xmm3 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] +; SSE2-NEXT: paddq %xmm4, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,1,1,3] ; SSE2-NEXT: pmuludq %xmm11, %xmm2 -; SSE2-NEXT: pmuludq %xmm13, %xmm11 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] -; SSE2-NEXT: pmuludq %xmm13, %xmm4 -; SSE2-NEXT: paddq %xmm4, %xmm2 +; SSE2-NEXT: pmuludq %xmm0, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm0, %xmm1 +; SSE2-NEXT: paddq %xmm1, %xmm2 ; SSE2-NEXT: psllq $32, %xmm2 ; SSE2-NEXT: paddq %xmm11, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; SSE2-NEXT: pmuludq %xmm12, %xmm1 -; SSE2-NEXT: pmuludq %xmm5, %xmm12 -; SSE2-NEXT: movdqa %xmm14, %xmm4 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm8[2],xmm4[3],xmm8[3] -; SSE2-NEXT: pmuludq %xmm5, %xmm4 -; SSE2-NEXT: paddq %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,1,3,3] +; SSE2-NEXT: pmuludq %xmm6, %xmm1 +; SSE2-NEXT: pmuludq %xmm7, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,1,3,3] +; SSE2-NEXT: pmuludq %xmm7, %xmm0 +; SSE2-NEXT: paddq %xmm0, %xmm1 ; SSE2-NEXT: psllq $32, %xmm1 -; SSE2-NEXT: paddq %xmm12, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE2-NEXT: pmuludq %xmm9, %xmm0 -; SSE2-NEXT: pmuludq %xmm10, %xmm9 -; SSE2-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1] -; SSE2-NEXT: pmuludq %xmm10, %xmm14 -; SSE2-NEXT: paddq %xmm14, %xmm0 +; SSE2-NEXT: paddq %xmm6, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm8, %xmm0 +; SSE2-NEXT: pmuludq %xmm9, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm9, %xmm4 +; SSE2-NEXT: paddq %xmm4, %xmm0 ; SSE2-NEXT: psllq $32, %xmm0 -; SSE2-NEXT: paddq %xmm9, %xmm0 +; SSE2-NEXT: paddq %xmm8, %xmm0 ; SSE2-NEXT: psrlq $16, %xmm0 ; SSE2-NEXT: psrlq $16, %xmm1 ; SSE2-NEXT: psrlq $16, %xmm2 @@ -1678,71 +1668,66 @@ define <8 x i64> @mulhsw_v8i16_ashr_i64(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: mulhsw_v8i16_ashr_i64: ; SSE2: # %bb.0: -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; SSE2-NEXT: psrad $16, %xmm12 -; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: pxor %xmm14, %xmm14 -; SSE2-NEXT: pcmpgtd %xmm12, %xmm14 -; SSE2-NEXT: movdqa 
%xmm12, %xmm9 -; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm14[2],xmm12[3],xmm14[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; SSE2-NEXT: psrad $16, %xmm7 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm2 -; SSE2-NEXT: movdqa %xmm7, %xmm11 -; SSE2-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm2[2],xmm7[3],xmm2[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE2-NEXT: psrad $16, %xmm3 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm3, %xmm10 -; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE2-NEXT: psrad $16, %xmm5 +; SSE2-NEXT: pxor %xmm13, %xmm13 +; SSE2-NEXT: pxor %xmm10, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm10 +; SSE2-NEXT: movdqa %xmm5, %xmm8 +; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm10[2],xmm5[3],xmm10[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm2, %xmm11 +; SSE2-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: pxor %xmm12, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm12 +; SSE2-NEXT: movdqa %xmm0, %xmm9 +; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm1, %xmm13 -; SSE2-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm4, %xmm6 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; SSE2-NEXT: pmuludq %xmm7, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm13 +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm13[2,1,3,3] +; SSE2-NEXT: pmuludq %xmm2, %xmm4 +; SSE2-NEXT: pmuludq %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,1,3,3] ; SSE2-NEXT: pmuludq %xmm1, %xmm7 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm8[2],xmm5[3],xmm8[3] -; SSE2-NEXT: pmuludq %xmm1, %xmm5 -; SSE2-NEXT: paddq %xmm5, %xmm6 -; SSE2-NEXT: psllq $32, %xmm6 -; SSE2-NEXT: paddq %xmm7, %xmm6 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] -; SSE2-NEXT: pmuludq %xmm11, %xmm4 -; SSE2-NEXT: pmuludq %xmm13, %xmm11 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] -; SSE2-NEXT: pmuludq %xmm13, %xmm2 -; SSE2-NEXT: paddq %xmm2, %xmm4 
+; SSE2-NEXT: paddq %xmm7, %xmm4 ; SSE2-NEXT: psllq $32, %xmm4 -; SSE2-NEXT: paddq %xmm11, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; SSE2-NEXT: pmuludq %xmm12, %xmm1 -; SSE2-NEXT: pmuludq %xmm3, %xmm12 -; SSE2-NEXT: movdqa %xmm14, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; SSE2-NEXT: pmuludq %xmm3, %xmm2 +; SSE2-NEXT: paddq %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm13[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm11, %xmm7 +; SSE2-NEXT: pmuludq %xmm6, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm6, %xmm1 +; SSE2-NEXT: paddq %xmm1, %xmm7 +; SSE2-NEXT: psllq $32, %xmm7 +; SSE2-NEXT: paddq %xmm11, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,1,3,3] +; SSE2-NEXT: pmuludq %xmm5, %xmm1 +; SSE2-NEXT: pmuludq %xmm0, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm10[2,1,3,3] +; SSE2-NEXT: pmuludq %xmm0, %xmm2 ; SSE2-NEXT: paddq %xmm2, %xmm1 ; SSE2-NEXT: psllq $32, %xmm1 -; SSE2-NEXT: paddq %xmm12, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE2-NEXT: pmuludq %xmm9, %xmm0 -; SSE2-NEXT: pmuludq %xmm10, %xmm9 -; SSE2-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1] -; SSE2-NEXT: pmuludq %xmm10, %xmm14 -; SSE2-NEXT: paddq %xmm14, %xmm0 +; SSE2-NEXT: paddq %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm8, %xmm0 +; SSE2-NEXT: pmuludq %xmm9, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm9, %xmm2 +; SSE2-NEXT: paddq %xmm2, %xmm0 ; SSE2-NEXT: psllq $32, %xmm0 -; SSE2-NEXT: paddq %xmm9, %xmm0 +; SSE2-NEXT: paddq %xmm8, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psrad $16, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] @@ -1755,18 +1740,18 @@ ; SSE2-NEXT: psrlq $16, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: movdqa %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm7, %xmm2 ; SSE2-NEXT: psrad $16, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] -; SSE2-NEXT: psrlq $16, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] +; SSE2-NEXT: psrlq $16, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE2-NEXT: movdqa %xmm6, %xmm3 +; SSE2-NEXT: movdqa %xmm4, %xmm3 ; SSE2-NEXT: psrad $16, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3] -; SSE2-NEXT: psrlq $16, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3] +; SSE2-NEXT: psrlq $16, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: mulhsw_v8i16_ashr_i64: diff --git a/llvm/test/CodeGen/X86/pr29112.ll b/llvm/test/CodeGen/X86/pr29112.ll --- a/llvm/test/CodeGen/X86/pr29112.ll +++ b/llvm/test/CodeGen/X86/pr29112.ll @@ -34,7 +34,7 @@ ; CHECK-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3] ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm2[3,3,3,3] ; CHECK-NEXT: vunpcklps {{.*#+}} xmm5 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1],xmm5[3] +; CHECK-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm3[1] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = 
xmm7[0,1],xmm2[1],xmm7[3] ; CHECK-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm3[3] @@ -42,7 +42,7 @@ ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[0,1,2],xmm3[1] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[1] ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm8 -; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm11[0,1],xmm2[3],xmm11[3] +; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm11[0,1],xmm2[3,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[2] ; CHECK-NEXT: vaddps %xmm2, %xmm14, %xmm2 ; CHECK-NEXT: vmovaps %xmm13, %xmm1 diff --git a/llvm/test/CodeGen/X86/promote-cmp.ll b/llvm/test/CodeGen/X86/promote-cmp.ll --- a/llvm/test/CodeGen/X86/promote-cmp.ll +++ b/llvm/test/CodeGen/X86/promote-cmp.ll @@ -29,13 +29,11 @@ ; SSE2-NEXT: pand %xmm7, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] ; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm6[0,2] -; SSE2-NEXT: movaps {{.*#+}} xmm4 = <1,1,u,0> -; SSE2-NEXT: xorps %xmm5, %xmm4 -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1,3,3] -; SSE2-NEXT: psllq $63, %xmm5 -; SSE2-NEXT: psrad $31, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] +; SSE2-NEXT: pxor {{.*}}(%rip), %xmm4 +; SSE2-NEXT: psllq $63, %xmm6 +; SSE2-NEXT: psrad $31, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE2-NEXT: pand %xmm5, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm5 ; SSE2-NEXT: por %xmm5, %xmm1 diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll --- a/llvm/test/CodeGen/X86/psubus.ll +++ b/llvm/test/CodeGen/X86/psubus.ll @@ -558,7 +558,7 @@ ; AVX1-NEXT: vpcmpeqd %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpackssdw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] @@ -908,7 +908,7 @@ ; AVX1-NEXT: vpcmpeqd %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpackssdw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] @@ -1038,7 +1038,7 @@ ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm3 ; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpackssdw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll --- a/llvm/test/CodeGen/X86/shrink_vmul.ll +++ b/llvm/test/CodeGen/X86/shrink_vmul.ll @@ -899,7 +899,7 @@ ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE-NEXT: psraw $8, %xmm1 ; X86-SSE-NEXT: pmullw %xmm0, %xmm1 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] ; X86-SSE-NEXT: psrad $16, %xmm0 ; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4) ; X86-SSE-NEXT: popl %esi @@ -935,7 +935,7 @@ ; X64-SSE-NEXT: 
punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X64-SSE-NEXT: psraw $8, %xmm1 ; X64-SSE-NEXT: pmullw %xmm0, %xmm1 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] ; X64-SSE-NEXT: psrad $16, %xmm0 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4) ; X64-SSE-NEXT: retq @@ -1472,7 +1472,7 @@ ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE-NEXT: psraw $8, %xmm0 ; X86-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; X86-SSE-NEXT: psrad $16, %xmm0 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) ; X86-SSE-NEXT: retl @@ -1497,7 +1497,7 @@ ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X64-SSE-NEXT: psraw $8, %xmm0 ; X64-SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; X64-SSE-NEXT: psrad $16, %xmm0 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) ; X64-SSE-NEXT: retq diff --git a/llvm/test/CodeGen/X86/shuffle-of-insert.ll b/llvm/test/CodeGen/X86/shuffle-of-insert.ll --- a/llvm/test/CodeGen/X86/shuffle-of-insert.ll +++ b/llvm/test/CodeGen/X86/shuffle-of-insert.ll @@ -30,7 +30,7 @@ ; SSE2-LABEL: ins_elt_1: ; SSE2: # %bb.0: ; SSE2-NEXT: movd %edi, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSE2-NEXT: retq ; @@ -80,7 +80,7 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: movd %edi, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; SSE2-NEXT: retq ; @@ -153,7 +153,7 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: movd %edi, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; SSE2-NEXT: retq ; @@ -176,7 +176,7 @@ ; SSE2-LABEL: ins_elt_3_to_1: ; SSE2: # %bb.0: ; SSE2-NEXT: movd %edi, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-128.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-128.ll --- a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-128.ll +++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-128.ll @@ -13,19 +13,7 @@ ; SSE2-LABEL: shuffle_v16i8_to_v8i8_1: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: psrlw $8, %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: movq %xmm0, (%rsi) ; SSE2-NEXT: retq @@ -670,10 +658,10 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: movw %ax, (%rsi) diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll @@ -187,7 +187,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -435,10 +435,9 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqu (%rdi), %xmm0 ; AVX512F-NEXT: vmovdqu 32(%rdi), %xmm1 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX512F-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX512F-NEXT: retq diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll --- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll @@ -1065,31 +1065,29 @@ define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,0] -; CHECK-SSE2-NEXT: movdqa 
%xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,2454267027,2147483649,1374389535] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1717986919,2454267027,2147483649,1374389535] ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,4294967295,4294967295,0] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm5 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5 -; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2 +; CHECK-SSE2-NEXT: pxor %xmm5, %xmm5 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm5 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm5 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,4294967295,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2 +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,1,1,0] +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 ; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 ; CHECK-SSE2-NEXT: psrad $5, %xmm3 @@ -1391,31 +1389,29 @@ define <4 x i32> @test_srem_odd_even_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,0] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,2454267027,0,1374389535] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1717986919,2454267027,0,1374389535] ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: 
pmuludq %xmm4, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,4294967295,0,0] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm5 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5 -; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2 +; CHECK-SSE2-NEXT: pxor %xmm5, %xmm5 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm5 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm5 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,0,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2 +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,1,1,0] +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 ; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 ; CHECK-SSE2-NEXT: psrad $5, %xmm3 @@ -2694,31 +2690,29 @@ define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,0] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,2147483649,0,1717986919] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1717986919,2147483649,0,1717986919] ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,4294967295,0,0] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm5 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5 -; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2 +; CHECK-SSE2-NEXT: pxor %xmm5, %xmm5 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm5 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm5 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,0,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2 +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,1,1,0] +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = 
xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 ; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 ; CHECK-SSE2-NEXT: psrad $1, %xmm3 @@ -2965,31 +2959,29 @@ define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,0] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,2147483649,0,1374389535] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1717986919,2147483649,0,1374389535] ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,4294967295,0,0] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm5 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5 -; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2 +; CHECK-SSE2-NEXT: pxor %xmm5, %xmm5 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm5 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm5 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,0,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2 +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,1,1,0] +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 ; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 ; CHECK-SSE2-NEXT: psrad $5, %xmm3 diff --git a/llvm/test/CodeGen/X86/test-shrink-bug.ll b/llvm/test/CodeGen/X86/test-shrink-bug.ll --- a/llvm/test/CodeGen/X86/test-shrink-bug.ll +++ b/llvm/test/CodeGen/X86/test-shrink-bug.ll @@ -70,8 +70,8 @@ ; CHECK-X64-NEXT: je .LBB1_3 ; CHECK-X64-NEXT: # %bb.1: ; CHECK-X64-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 -; CHECK-X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; CHECK-X64-NEXT: pextrw $1, %xmm0, %eax +; CHECK-X64-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8] +; CHECK-X64-NEXT: pextrw $4, %xmm0, %eax ; CHECK-X64-NEXT: testb $1, %al ; CHECK-X64-NEXT: jne .LBB1_3 ; CHECK-X64-NEXT: # %bb.2: # %no diff --git 
a/llvm/test/CodeGen/X86/trunc-subvector.ll b/llvm/test/CodeGen/X86/trunc-subvector.ll --- a/llvm/test/CodeGen/X86/trunc-subvector.ll +++ b/llvm/test/CodeGen/X86/trunc-subvector.ll @@ -79,12 +79,8 @@ ; ; AVX2-LABEL: test5: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6,6,6,6] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [17179869187,17179869187,17179869187,17179869187] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -181,12 +177,8 @@ ; ; AVX2-LABEL: test10: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6,6,6,6] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [17179869187,17179869187,17179869187,17179869187] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/udiv_fix.ll b/llvm/test/CodeGen/X86/udiv_fix.ll --- a/llvm/test/CodeGen/X86/udiv_fix.ll +++ b/llvm/test/CodeGen/X86/udiv_fix.ll @@ -238,36 +238,37 @@ ; X64-LABEL: vec: ; X64: # %bb.0: ; X64-NEXT: pxor %xmm2, %xmm2 -; X64-NEXT: movdqa %xmm1, %xmm4 +; X64-NEXT: movdqa %xmm1, %xmm3 +; X64-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; X64-NEXT: movq %xmm3, %rcx +; X64-NEXT: movdqa %xmm0, %xmm4 ; X64-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; X64-NEXT: movq %xmm4, %rcx -; X64-NEXT: movdqa %xmm0, %xmm5 -; X64-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; X64-NEXT: psllq $31, %xmm5 -; X64-NEXT: movq %xmm5, %rax +; X64-NEXT: psllq $31, %xmm4 +; X64-NEXT: movq %xmm4, %rax ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx ; X64-NEXT: movq %rax, %xmm3 ; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] -; X64-NEXT: movq %xmm4, %rcx -; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,2,3] ; X64-NEXT: movq %xmm4, %rax +; X64-NEXT: movdqa %xmm1, %xmm4 +; X64-NEXT: psrldq {{.*#+}} xmm4 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-NEXT: movq %xmm4, %rcx ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx ; X64-NEXT: movq %rax, %xmm4 ; X64-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X64-NEXT: movq %xmm1, %rcx ; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X64-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; X64-NEXT: movq %xmm2, %rcx ; X64-NEXT: psllq $31, %xmm0 ; X64-NEXT: movq %xmm0, %rax ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx ; X64-NEXT: movq %rax, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; X64-NEXT: movq %xmm1, %rcx ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-NEXT: movq %xmm0, %rax +; X64-NEXT: psrlq $32, %xmm1 +; X64-NEXT: movq %xmm1, %rcx ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx ; X64-NEXT: movq %rax, 
%xmm0 diff --git a/llvm/test/CodeGen/X86/udiv_fix_sat.ll b/llvm/test/CodeGen/X86/udiv_fix_sat.ll --- a/llvm/test/CodeGen/X86/udiv_fix_sat.ll +++ b/llvm/test/CodeGen/X86/udiv_fix_sat.ll @@ -323,22 +323,23 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-LABEL: vec: ; X64: # %bb.0: -; X64-NEXT: pxor %xmm8, %xmm8 +; X64-NEXT: pxor %xmm3, %xmm3 ; X64-NEXT: movdqa %xmm1, %xmm2 -; X64-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; X64-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; X64-NEXT: movq %xmm2, %rcx -; X64-NEXT: movdqa %xmm0, %xmm4 -; X64-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm8[2],xmm4[3],xmm8[3] -; X64-NEXT: paddq %xmm4, %xmm4 -; X64-NEXT: psllq $31, %xmm4 -; X64-NEXT: movq %xmm4, %rax +; X64-NEXT: movdqa %xmm0, %xmm2 +; X64-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; X64-NEXT: paddq %xmm2, %xmm2 +; X64-NEXT: psllq $31, %xmm2 +; X64-NEXT: movq %xmm2, %rax ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx ; X64-NEXT: movq %rax, %xmm7 ; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; X64-NEXT: movq %xmm2, %rcx -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] ; X64-NEXT: movq %xmm2, %rax +; X64-NEXT: movdqa %xmm1, %xmm2 +; X64-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-NEXT: movq %xmm2, %rcx ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx ; X64-NEXT: movq %rax, %xmm2 @@ -346,13 +347,13 @@ ; X64-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] ; X64-NEXT: movdqa %xmm7, %xmm2 ; X64-NEXT: pxor %xmm4, %xmm2 -; X64-NEXT: movdqa {{.*#+}} xmm9 = [9223372043297226751,9223372043297226751] -; X64-NEXT: movdqa %xmm9, %xmm6 +; X64-NEXT: movdqa {{.*#+}} xmm8 = [9223372043297226751,9223372043297226751] +; X64-NEXT: movdqa %xmm8, %xmm6 ; X64-NEXT: pcmpgtd %xmm2, %xmm6 -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2] -; X64-NEXT: pcmpeqd %xmm9, %xmm2 +; X64-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2] +; X64-NEXT: pcmpeqd %xmm8, %xmm2 ; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] -; X64-NEXT: pand %xmm3, %xmm5 +; X64-NEXT: pand %xmm9, %xmm5 ; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] ; X64-NEXT: por %xmm5, %xmm2 ; X64-NEXT: movdqa {{.*#+}} xmm6 = [8589934591,8589934591] @@ -360,28 +361,28 @@ ; X64-NEXT: pandn %xmm6, %xmm2 ; X64-NEXT: por %xmm7, %xmm2 ; X64-NEXT: psrlq $1, %xmm2 -; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] -; X64-NEXT: movq %xmm1, %rcx -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; X64-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3] +; X64-NEXT: movq %xmm3, %rcx ; X64-NEXT: paddq %xmm0, %xmm0 ; X64-NEXT: psllq $31, %xmm0 ; X64-NEXT: movq %xmm0, %rax ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx ; X64-NEXT: movq %rax, %xmm3 -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; X64-NEXT: movq %xmm1, %rcx ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-NEXT: movq %xmm0, %rax +; X64-NEXT: psrlq $32, %xmm1 +; X64-NEXT: movq %xmm1, %rcx ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx ; X64-NEXT: movq %rax, %xmm0 ; X64-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; X64-NEXT: pxor %xmm3, %xmm4 -; X64-NEXT: movdqa %xmm9, %xmm0 +; X64-NEXT: movdqa %xmm8, %xmm0 ; X64-NEXT: pcmpgtd %xmm4, %xmm0 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; X64-NEXT: pcmpeqd %xmm9, %xmm4 +; X64-NEXT: pcmpeqd %xmm8, %xmm4 ; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; X64-NEXT: pand 
%xmm1, %xmm4 ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll --- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll @@ -200,26 +200,24 @@ ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 ; CHECK-SSE2-NEXT: psrld $1, %xmm1 ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: psrld $31, %xmm1 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: psrld $31, %xmm0 @@ -300,26 +298,24 @@ ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 ; CHECK-SSE2-NEXT: psrld $1, %xmm1 ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: psrld $31, %xmm1 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; 
CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 @@ -638,13 +634,12 @@ ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 ; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 @@ -722,25 +717,23 @@ ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 ; CHECK-SSE2-NEXT: psrld $1, %xmm1 ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: 
punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: psrld $31, %xmm0 @@ -857,15 +850,15 @@ ; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm3 ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] ; CHECK-SSE41-NEXT: psrld $5, %xmm3 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm3 -; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 @@ -880,11 +873,11 @@ ; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm2, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3 ; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 @@ -989,25 +982,22 @@ ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 ; CHECK-SSE2-NEXT: psrld $1, %xmm1 ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = 
xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: psrld $31, %xmm0 @@ -1098,20 +1088,18 @@ ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: movaps %xmm0, %xmm3 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[3,0] -; CHECK-SSE2-NEXT: movaps %xmm2, %xmm4 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[0,2] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,14,1,100] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; CHECK-SSE2-NEXT: psrld $5, %xmm1 +; CHECK-SSE2-NEXT: movaps %xmm0, %xmm3 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm4, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [5,14,1,100] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: psrld $31, %xmm0 @@ -1208,13 +1196,12 @@ ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 ; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 @@ -1292,25 +1279,23 @@ ; 
CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 ; CHECK-SSE2-NEXT: psrld $1, %xmm1 ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: psrld $31, %xmm0 @@ -1427,15 +1412,15 @@ ; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm3 ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] ; CHECK-SSE41-NEXT: psrld $5, %xmm3 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm3 -; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 @@ -1450,11 +1435,11 @@ ; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm2, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; 
CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3 ; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 @@ -1512,17 +1497,17 @@ ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 ; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,4294967295,16,5] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; CHECK-SSE2-NEXT: psrld $31, %xmm3 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm2[2,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [5,4294967295,16,5] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: psrld $31, %xmm0 @@ -1536,15 +1521,15 @@ ; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] ; CHECK-SSE41-NEXT: psrld $31, %xmm3 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm3 -; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 @@ -1558,11 +1543,11 @@ ; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 ; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = 
xmm2[0,1,2,3],xmm3[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] ; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 @@ -1614,24 +1599,23 @@ ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 ; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [14,4294967295,16,14] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; CHECK-SSE2-NEXT: psrld $31, %xmm4 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm2[3,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [14,4294967295,16,14] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: psrld $31, %xmm0 @@ -1648,15 +1632,15 @@ ; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm4 ; CHECK-SSE41-NEXT: pmuludq %xmm3, %xmm1 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] ; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] ; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] ; CHECK-SSE41-NEXT: psrld $31, %xmm4 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm2[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm4 -; CHECK-SSE41-NEXT: psubd %xmm4, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 @@ -1672,11 +1656,11 @@ ; CHECK-AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2 ; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; 
CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] ; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 @@ -1722,25 +1706,28 @@ ; CHECK-SSE2-LABEL: test_urem_odd_even_allones_and_poweroftwo: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,1374389535] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm2 -; CHECK-SSE2-NEXT: psrld $5, %xmm2 -; CHECK-SSE2-NEXT: psrld $31, %xmm3 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [5,4294967295,16,100] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 ; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm4[2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [5,4294967295,16,100] ; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm4 +; CHECK-SSE2-NEXT: psrld $5, %xmm4 +; CHECK-SSE2-NEXT: psrld $31, %xmm3 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm4[3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 @@ -1758,13 +1745,13 @@ ; CHECK-SSE41-NEXT: psrld $31, %xmm3 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; 
CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 @@ -1780,9 +1767,10 @@ ; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] ; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] ; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 @@ -1875,24 +1863,21 @@ ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: movaps %xmm0, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] ; CHECK-SSE2-NEXT: psrld $31, %xmm4 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[3,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm1[3,3] ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [14,4294967295,1,14] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 @@ -1996,9 +1981,7 @@ ; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[3,0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0,2] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] @@ -2099,18 +2082,16 @@ ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3] -; CHECK-SSE2-NEXT: 
shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [5,16,1,5] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: psrld $31, %xmm0 @@ -2198,23 +2179,20 @@ ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: movaps %xmm0, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[3,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [14,16,1,14] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 @@ -2315,9 +2293,7 @@ ; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[3,0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0,2] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] @@ -2408,23 +2384,25 @@ ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,0] ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[2,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = 
[5,4294967295,16,1] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4 -; CHECK-SSE2-NEXT: psrld $31, %xmm4 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [5,4294967295,16,1] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm3 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: psrld $31, %xmm0 @@ -2435,18 +2413,18 @@ ; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,0] ; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2 ; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[3,3,3,3] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE41-NEXT: psrld $2, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE41-NEXT: psrld $31, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm0[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE41-NEXT: psrld $31, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm0[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm3 +; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 @@ -2456,9 +2434,10 @@ ; CHECK-AVX1: # %bb.0: ; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,0] ; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[3,3,3,3] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm2, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: 
vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 @@ -2507,26 +2486,27 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_even_allones_and_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: psrld $1, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2147483649,268435456,0] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4 +; CHECK-SSE2-NEXT: psrld $1, %xmm4 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 ; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[2,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [14,4294967295,16,1] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [14,4294967295,16,1] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm3 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4 -; CHECK-SSE2-NEXT: psrld $31, %xmm4 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 @@ -2542,18 +2522,18 @@ ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5,6,7] ; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2147483649,268435456,0] ; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 -; CHECK-SSE41-NEXT: psrld $2, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE41-NEXT: psrld $31, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm0[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] +; CHECK-SSE41-NEXT: pshufd 
{{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: psrld $31, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm0[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm3 +; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 @@ -2565,9 +2545,10 @@ ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5,6,7] ; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2454267027,2147483649,268435456,0] ; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll --- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll @@ -332,13 +332,11 @@ ; ; CHECK-AVX2-LABEL: t32_tautological: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,2147483648,2863311531] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3] ; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vec_insert-2.ll b/llvm/test/CodeGen/X86/vec_insert-2.ll --- a/llvm/test/CodeGen/X86/vec_insert-2.ll +++ b/llvm/test/CodeGen/X86/vec_insert-2.ll @@ -6,13 +6,13 @@ ; X32-LABEL: t1: ; X32: # %bb.0: ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; X32-NEXT: retl ; ; X64-LABEL: t1: ; X64: # %bb.0: -; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0] +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; X64-NEXT: movaps %xmm1, %xmm0 ; X64-NEXT: retq @@ -24,14 +24,14 @@ ; X32-LABEL: t2: ; X32: # %bb.0: ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; X32-NEXT: retl ; ; X64-LABEL: t2: ; X64: # %bb.0: ; 
X64-NEXT: movd %edi, %xmm1 -; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; X64-NEXT: retq %tmp1 = insertelement <4 x i32> %tmp, i32 %s, i32 3 diff --git a/llvm/test/CodeGen/X86/vec_insert-3.ll b/llvm/test/CodeGen/X86/vec_insert-3.ll --- a/llvm/test/CodeGen/X86/vec_insert-3.ll +++ b/llvm/test/CodeGen/X86/vec_insert-3.ll @@ -6,11 +6,9 @@ ; X32-LABEL: t1: ; X32: # %bb.0: ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-NEXT: movaps %xmm0, %xmm2 -; X32-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] -; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[2,0] -; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] +; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X32-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] ; X32-NEXT: retl ; ; X64-LABEL: t1: diff --git a/llvm/test/CodeGen/X86/vec_insert-5.ll b/llvm/test/CodeGen/X86/vec_insert-5.ll --- a/llvm/test/CodeGen/X86/vec_insert-5.ll +++ b/llvm/test/CodeGen/X86/vec_insert-5.ll @@ -11,7 +11,7 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: shll $12, %ecx ; X32-NEXT: movd %ecx, %xmm0 -; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] +; X32-NEXT: psllq $32, %xmm0 ; X32-NEXT: movq %xmm0, (%eax) ; X32-NEXT: retl ; @@ -19,7 +19,7 @@ ; X64: # %bb.0: ; X64-NEXT: shll $12, %edi ; X64-NEXT: movd %edi, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] +; X64-NEXT: psllq $32, %xmm0 ; X64-NEXT: movq %xmm0, (%rsi) ; X64-NEXT: retq %tmp12 = shl i32 %a, 12 @@ -36,7 +36,7 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: xorps %xmm0, %xmm0 ; X32-NEXT: xorps %xmm1, %xmm1 -; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],mem[0,0] +; X32-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] ; X32-NEXT: retl ; @@ -44,7 +44,7 @@ ; X64: # %bb.0: ; X64-NEXT: xorps %xmm0, %xmm0 ; X64-NEXT: xorps %xmm1, %xmm1 -; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],mem[0,0] +; X64-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] ; X64-NEXT: retq %tmp1 = load <4 x float>, <4 x float>* %P diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll --- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -189,7 +189,7 @@ define <2 x double> @sitofp_2i16_to_2f64(<8 x i16> %a) { ; SSE2-LABEL: sitofp_2i16_to_2f64: ; SSE2: # %bb.0: -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; SSE2-NEXT: psrad $16, %xmm0 ; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 ; SSE2-NEXT: retq @@ -213,7 +213,7 @@ define <2 x double> @sitofp_8i16_to_2f64(<8 x i16> %a) { ; SSE2-LABEL: sitofp_8i16_to_2f64: ; SSE2: # %bb.0: -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; SSE2-NEXT: psrad $16, %xmm0 ; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 ; SSE2-NEXT: retq @@ -246,7 +246,7 @@ ; SSE2-LABEL: sitofp_2i8_to_2f64: ; SSE2: # %bb.0: ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; SSE2-NEXT: psrad $24, %xmm0 ; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 ; SSE2-NEXT: retq @@ -271,7 +271,7 @@ ; SSE2-LABEL: sitofp_16i8_to_2f64: ; 
SSE2: # %bb.0: ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; SSE2-NEXT: psrad $24, %xmm0 ; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 ; SSE2-NEXT: retq @@ -3071,7 +3071,7 @@ ; SSE2-NEXT: movzwl (%rdi), %eax ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; SSE2-NEXT: psrad $24, %xmm0 ; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 ; SSE2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vec_set-6.ll b/llvm/test/CodeGen/X86/vec_set-6.ll --- a/llvm/test/CodeGen/X86/vec_set-6.ll +++ b/llvm/test/CodeGen/X86/vec_set-6.ll @@ -12,7 +12,7 @@ ; ; X64-LABEL: test: ; X64: # %bb.0: -; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero ; X64-NEXT: xorps %xmm2, %xmm2 ; X64-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] ; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,1] diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll @@ -275,7 +275,7 @@ ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm4 ; AVX1-NEXT: vpsllw $7, %xmm4, %xmm4 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7] ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 @@ -711,7 +711,7 @@ ; AVX1-NEXT: vpackuswb %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vpsubb %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm5 ; AVX1-NEXT: vpsllw $7, %xmm5, %xmm5 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7] ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 diff --git a/llvm/test/CodeGen/X86/vector-pack-256.ll b/llvm/test/CodeGen/X86/vector-pack-256.ll --- a/llvm/test/CodeGen/X86/vector-pack-256.ll +++ b/llvm/test/CodeGen/X86/vector-pack-256.ll @@ -228,7 +228,7 @@ ; AVX1-NEXT: vpsrld $17, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll --- a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll @@ -297,7 +297,7 @@ ; AVX1-LABEL: trunc_v8i32_v8i1: ; 
AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] diff --git a/llvm/test/CodeGen/X86/vector-reduce-mul.ll b/llvm/test/CodeGen/X86/vector-reduce-mul.ll --- a/llvm/test/CodeGen/X86/vector-reduce-mul.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-mul.ll @@ -19,8 +19,7 @@ ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psrlq $32, %xmm2 ; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; SSE-NEXT: pmuludq %xmm0, %xmm3 ; SSE-NEXT: paddq %xmm2, %xmm3 ; SSE-NEXT: psllq $32, %xmm3 @@ -34,7 +33,7 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -48,7 +47,7 @@ ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -62,7 +61,7 @@ ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -107,8 +106,7 @@ ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psrlq $32, %xmm2 ; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; SSE-NEXT: pmuludq %xmm0, %xmm3 ; SSE-NEXT: paddq %xmm2, %xmm3 ; SSE-NEXT: psllq $32, %xmm3 @@ -131,7 +129,7 @@ ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -155,7 +153,7 @@ ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX2-NEXT: 
vpaddq %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -179,7 +177,7 @@ ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -203,7 +201,7 @@ ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -274,8 +272,7 @@ ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psrlq $32, %xmm2 ; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; SSE-NEXT: pmuludq %xmm0, %xmm3 ; SSE-NEXT: paddq %xmm2, %xmm3 ; SSE-NEXT: psllq $32, %xmm3 @@ -315,7 +312,7 @@ ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -347,7 +344,7 @@ ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -380,7 +377,7 @@ ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -413,7 +410,7 @@ ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -527,8 +524,7 @@ ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psrlq $32, %xmm2 ; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: pshufd {{.*#+}} 
xmm3 = xmm0[3,3,3,3] ; SSE-NEXT: pmuludq %xmm0, %xmm3 ; SSE-NEXT: paddq %xmm2, %xmm3 ; SSE-NEXT: psllq $32, %xmm3 @@ -602,7 +598,7 @@ ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -650,7 +646,7 @@ ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -691,7 +687,7 @@ ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -732,7 +728,7 @@ ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -1691,17 +1687,16 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,2,3,3] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pmullw %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: packuswb %xmm3, %xmm0 -; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm1, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: pmullw %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: packuswb %xmm3, %xmm1 +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $8, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: pmullw %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq ; @@ -1871,17 +1866,16 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,2,3,3] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pmullw %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: packuswb %xmm3, %xmm0 -; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; 
SSE2-NEXT: pmullw %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm1, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: pmullw %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: packuswb %xmm3, %xmm1 +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $8, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: pmullw %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq ; @@ -2091,17 +2085,16 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,2,3,3] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pmullw %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: packuswb %xmm3, %xmm0 -; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm1, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: pmullw %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: packuswb %xmm3, %xmm1 +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $8, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: pmullw %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq ; @@ -2386,17 +2379,16 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,2,3,3] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pmullw %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm3, %xmm1 -; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: pmullw %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: packuswb %xmm3, %xmm0 +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $8, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: pmullw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll --- a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll @@ -286,7 +286,7 @@ ; AVX1-LABEL: trunc_v8i32_v8i1: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; 
AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll --- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll @@ -291,7 +291,7 @@ ; AVX1-LABEL: trunc_v8i32_v8i1: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -1865,19 +1865,19 @@ define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06(<16 x i8> %a) { ; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06: ; SSE: # %bb.0: -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero +; SSE-NEXT: psrlq $8, %xmm0 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] ; SSE-NEXT: retq ; ; AVX1-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero +; AVX1-NEXT: vpsrlq $8, %xmm0, %xmm0 ; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero +; AVX2-SLOW-NEXT: vpsrlq $8, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] ; AVX2-SLOW-NEXT: retq ; @@ -1893,7 +1893,7 @@ ; ; XOP-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06: ; XOP: # %bb.0: -; XOP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero +; XOP-NEXT: vpsrlq $8, %xmm0, %xmm0 ; XOP-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] ; XOP-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> , <16 x i32> @@ -2459,12 +2459,10 @@ ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: movzbl (%rsi), %eax ; SSE2-NEXT: movd %eax, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll @@ -332,19 +332,19 @@ define <4 x i32> @shuffle_v4i32_0124(<4 x i32> %a, <4 x i32> %b) { ; SSE2-LABEL: shuffle_v4i32_0124: ; SSE2: # %bb.0: -; SSE2-NEXT: shufps {{.*#+}} xmm1 = 
xmm1[0,0],xmm0[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v4i32_0124: ; SSE3: # %bb.0: -; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: shuffle_v4i32_0124: ; SSSE3: # %bb.0: -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; SSSE3-NEXT: retq ; @@ -377,19 +377,19 @@ define <4 x i32> @shuffle_v4i32_0142(<4 x i32> %a, <4 x i32> %b) { ; SSE2-LABEL: shuffle_v4i32_0142: ; SSE2: # %bb.0: -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v4i32_0142: ; SSE3: # %bb.0: -; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: shuffle_v4i32_0142: ; SSSE3: # %bb.0: -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] ; SSSE3-NEXT: retq ; @@ -425,21 +425,21 @@ define <4 x i32> @shuffle_v4i32_0412(<4 x i32> %a, <4 x i32> %b) { ; SSE2-LABEL: shuffle_v4i32_0412: ; SSE2: # %bb.0: -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v4i32_0412: ; SSE3: # %bb.0: -; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] +; SSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2] ; SSE3-NEXT: movaps %xmm1, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: shuffle_v4i32_0412: ; SSSE3: # %bb.0: -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] +; SSSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2] ; SSSE3-NEXT: movaps %xmm1, %xmm0 ; SSSE3-NEXT: retq @@ -451,12 +451,19 @@ ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v4i32_0412: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2] -; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX1OR2-NEXT: retq +; AVX1-LABEL: shuffle_v4i32_0412: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2] +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4i32_0412: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 +; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2] +; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4i32_0412: ; AVX512VL: # %bb.0: @@ -469,21 +476,21 @@ define <4 x i32> @shuffle_v4i32_4012(<4 x i32> %a, <4 x i32> %b) { ; SSE2-LABEL: shuffle_v4i32_4012: ; SSE2: # %bb.0: -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE2-NEXT: shufps 
{{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v4i32_4012: ; SSE3: # %bb.0: -; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] +; SSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2] ; SSE3-NEXT: movaps %xmm1, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: shuffle_v4i32_4012: ; SSSE3: # %bb.0: -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] +; SSSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2] ; SSSE3-NEXT: movaps %xmm1, %xmm0 ; SSSE3-NEXT: retq @@ -672,22 +679,22 @@ define <4 x float> @shuffle_v4f32_z4zz(<4 x float> %a) { ; SSE2-LABEL: shuffle_v4f32_z4zz: ; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v4f32_z4zz: ; SSE3: # %bb.0: +; SSE3-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSE3-NEXT: xorps %xmm1, %xmm1 -; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: shuffle_v4f32_z4zz: ; SSSE3: # %bb.0: +; SSSE3-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSSE3-NEXT: xorps %xmm1, %xmm1 -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSSE3-NEXT: retq ; @@ -707,26 +714,23 @@ define <4 x float> @shuffle_v4f32_zz4z(<4 x float> %a) { ; SSE2-LABEL: shuffle_v4f32_zz4z: ; SSE2: # %bb.0: -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v4f32_zz4z: ; SSE3: # %bb.0: -; SSE3-NEXT: xorps %xmm1, %xmm1 -; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0] -; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] -; SSE3-NEXT: movaps %xmm1, %xmm0 +; SSE3-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero +; SSE3-NEXT: pxor %xmm0, %xmm0 +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: shuffle_v4f32_zz4z: ; SSSE3: # %bb.0: -; SSSE3-NEXT: xorps %xmm1, %xmm1 -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0] -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] -; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero +; SSSE3-NEXT: pxor %xmm0, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: shuffle_v4f32_zz4z: @@ -821,21 +825,21 @@ ; SSE2-LABEL: shuffle_v4f32_z6zz: ; SSE2: # %bb.0: ; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0] +; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v4f32_z6zz: ; SSE3: # %bb.0: ; SSE3-NEXT: xorps %xmm1, %xmm1 -; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0] +; SSE3-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: shuffle_v4f32_z6zz: ; SSSE3: # %bb.0: ; SSSE3-NEXT: xorps %xmm1, %xmm1 -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0] +; SSSE3-NEXT: unpckhpd {{.*#+}} xmm0 = 
xmm0[1],xmm1[1] ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSSE3-NEXT: retq ; @@ -856,7 +860,7 @@ ; SSE2-LABEL: shuffle_v4f32_0z23: ; SSE2: # %bb.0: ; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -864,7 +868,7 @@ ; SSE3-LABEL: shuffle_v4f32_0z23: ; SSE3: # %bb.0: ; SSE3-NEXT: xorps %xmm1, %xmm1 -; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] +; SSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] ; SSE3-NEXT: movaps %xmm1, %xmm0 ; SSE3-NEXT: retq @@ -872,7 +876,7 @@ ; SSSE3-LABEL: shuffle_v4f32_0z23: ; SSSE3: # %bb.0: ; SSSE3-NEXT: xorps %xmm1, %xmm1 -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] +; SSSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] ; SSSE3-NEXT: movaps %xmm1, %xmm0 ; SSSE3-NEXT: retq @@ -1058,29 +1062,29 @@ define <4 x float> @shuffle_v4f32_0zz4(<4 x float> %a, <4 x float> %b) { ; SSE2-LABEL: shuffle_v4f32_0zz4: ; SSE2: # %bb.0: -; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: movq {{.*#+}} xmm2 = xmm1[0],zero +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] +; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v4f32_0zz4: ; SSE3: # %bb.0: -; SSE3-NEXT: xorps %xmm2, %xmm2 -; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[2,0] -; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSE3-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE3-NEXT: movaps %xmm2, %xmm0 +; SSE3-NEXT: movq {{.*#+}} xmm2 = xmm1[0],zero +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] +; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE3-NEXT: movaps %xmm1, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: shuffle_v4f32_0zz4: ; SSSE3: # %bb.0: -; SSSE3-NEXT: xorps %xmm2, %xmm2 -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[2,0] -; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSSE3-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSSE3-NEXT: movaps %xmm2, %xmm0 +; SSSE3-NEXT: movq {{.*#+}} xmm2 = xmm1[0],zero +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] +; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSSE3-NEXT: movaps %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: shuffle_v4f32_0zz4: @@ -1142,27 +1146,27 @@ define <4 x float> @shuffle_v4f32_0z24(<4 x float> %a, <4 x float> %b) { ; SSE2-LABEL: shuffle_v4f32_0z24: ; SSE2: # %bb.0: -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0] ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v4f32_0z24: ; SSE3: # %bb.0: -; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE3-NEXT: xorps %xmm2, %xmm2 -; SSE3-NEXT: shufps 
{{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0] +; SSE3-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0] ; SSE3-NEXT: movaps %xmm2, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: shuffle_v4f32_0z24: ; SSSE3: # %bb.0: -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSSE3-NEXT: xorps %xmm2, %xmm2 -; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0] +; SSSE3-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0] ; SSSE3-NEXT: movaps %xmm2, %xmm0 ; SSSE3-NEXT: retq @@ -1347,22 +1351,22 @@ define <4 x i32> @shuffle_v4i32_z6zz(<4 x i32> %a) { ; SSE2-LABEL: shuffle_v4i32_z6zz: ; SSE2: # %bb.0: +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v4i32_z6zz: ; SSE3: # %bb.0: +; SSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero ; SSE3-NEXT: xorps %xmm1, %xmm1 -; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0] ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: shuffle_v4i32_z6zz: ; SSSE3: # %bb.0: +; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero ; SSSE3-NEXT: xorps %xmm1, %xmm1 -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0] ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSSE3-NEXT: retq ; @@ -1565,13 +1569,13 @@ define <4 x i32> @shuffle_v4i32_2456(<4 x i32> %a, <4 x i32> %b) { ; SSE2-LABEL: shuffle_v4i32_2456: ; SSE2: # %bb.0: -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm1[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2] ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v4i32_2456: ; SSE3: # %bb.0: -; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm1[0,0] +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1] ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2] ; SSE3-NEXT: retq ; @@ -2100,19 +2104,19 @@ define <4 x i32> @extract3_insert3_v4i32_0127(<4 x i32> %a0, <4 x i32> %a1) { ; SSE2-LABEL: extract3_insert3_v4i32_0127: ; SSE2: # %bb.0: -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; SSE2-NEXT: retq ; ; SSE3-LABEL: extract3_insert3_v4i32_0127: ; SSE3: # %bb.0: -; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[2,0] +; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: extract3_insert3_v4i32_0127: ; SSSE3: # %bb.0: -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[2,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; SSSE3-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -1343,7 +1343,7 @@ ; SSE2-LABEL: shuffle_v8i16_032dXXXX: ; SSE2: # %bb.0: ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,2,0] ; 
SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] @@ -2949,19 +2949,19 @@ define <8 x i16> @shuffle_v8i16_9zzzuuuu(<8 x i16> %x) { ; SSE-LABEL: shuffle_v8i16_9zzzuuuu: ; SSE: # %bb.0: -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: retq ; ; AVX1-LABEL: shuffle_v8i16_9zzzuuuu: ; AVX1: # %bb.0: -; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8i16_9zzzuuuu: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] +; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: retq ; @@ -2972,7 +2972,7 @@ ; ; AVX512VL-SLOW-LABEL: shuffle_v8i16_9zzzuuuu: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] +; AVX512VL-SLOW-NEXT: vbroadcastss %xmm0, %xmm0 ; AVX512VL-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512VL-SLOW-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -3504,7 +3504,7 @@ ; AVX2-LABEL: shuffle_v16i16_04_05_06_07_16_17_18_27_12_13_14_15_24_25_26_27: ; AVX2: # %bb.0: ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7,8,9,10],ymm1[11],ymm0[12,13,14,15] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-NEXT: retq @@ -3528,7 +3528,7 @@ ; XOPAVX2-LABEL: shuffle_v16i16_04_05_06_07_16_17_18_27_12_13_14_15_24_25_26_27: ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; XOPAVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3] +; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7,8,9,10],ymm1[11],ymm0[12,13,14,15] ; XOPAVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; XOPAVX2-NEXT: retq @@ -5075,8 +5075,8 @@ ; AVX2-LABEL: shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_31: ; AVX2: # %bb.0: ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,u,u,16,17,24,25,18,19,26,27,20,21,28,29,22,23,u,u] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] ; AVX2-NEXT: retq ; @@ -5100,8 +5100,8 @@ ; 
XOPAVX2-LABEL: shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_31: ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; XOPAVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3] ; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,u,u,16,17,24,25,18,19,26,27,20,21,28,29,22,23,u,u] +; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] ; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -5172,7 +5172,7 @@ ; AVX2-LABEL: shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_27: ; AVX2: # %bb.0: ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7,8,9,10],ymm1[11],ymm0[12,13,14,15] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,0,1,10,11,2,3,12,13,4,5,14,15,6,7,24,25,16,17,26,27,18,19,28,29,20,21,30,31,22,23] ; AVX2-NEXT: retq @@ -5197,7 +5197,7 @@ ; XOPAVX2-LABEL: shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_27: ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; XOPAVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3] +; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7,8,9,10],ymm1[11],ymm0[12,13,14,15] ; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,0,1,10,11,2,3,12,13,4,5,14,15,6,7,24,25,16,17,26,27,18,19,28,29,20,21,30,31,22,23] ; XOPAVX2-NEXT: retq @@ -5511,17 +5511,17 @@ ; AVX2-SLOW-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,2,3,6,7,8,9,12,13,10,11,u,u,16,17,20,21,18,19,22,23,24,25,28,29,26,27,u,u] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] ; AVX2-FAST-NEXT: retq ; @@ -5545,9 +5545,9 @@ ; XOPAVX2-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31: ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; XOPAVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3] ; XOPAVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15] ; XOPAVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15] +; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] ; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -5808,7 +5808,7 @@ ; AVX2-LABEL: shuffle_v16i16_00_01_02_21_20_21_22_11_08_09_10_29_28_29_30_11: ; AVX2: # %bb.0: ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7,8,9,10],ymm0[11],ymm1[12,13,14,15] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7,16,17,18,19,20,21,26,27,24,25,26,27,28,29,22,23] ; AVX2-NEXT: retq @@ -5833,7 +5833,7 @@ ; XOPAVX2-LABEL: shuffle_v16i16_00_01_02_21_20_21_22_11_08_09_10_29_28_29_30_11: ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; XOPAVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] ; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7,8,9,10],ymm0[11],ymm1[12,13,14,15] ; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7,16,17,18,19,20,21,26,27,24,25,26,27,28,29,22,23] ; XOPAVX2-NEXT: retq @@ -7364,17 +7364,30 @@ ; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: PR34369: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,10,11,u,u,u,u,u,u,4,5] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,0,1,0,1,u,u,10,11,4,5,4,5,u,u,30,31,16,17,28,29,16,17,18,19,20,21,24,25,24,25] -; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6],xmm2[7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: PR34369: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[6,7,0,1,0,1,u,u,10,11,4,5,4,5,u,u,30,31,16,17,28,29,16,17,18,19,20,21,24,25,24,25] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5,6],xmm0[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: PR34369: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,10,11,u,u,u,u,u,u,4,5] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,0,1,0,1,u,u,10,11,4,5,4,5,u,u,30,31,16,17,28,29,16,17,18,19,20,21,24,25,24,25] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6],xmm2[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: PR34369: ; AVX512VL: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -130,7 +130,7 @@ ; 
AVX1: # %bb.0: ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[0,1,0,1] ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[2,0],ymm1[4,4],ymm0[6,4] ; AVX1-NEXT: retq ; @@ -368,7 +368,7 @@ define <8 x float> @shuffle_v8f32_08991abb(<8 x float> %a, <8 x float> %b) { ; AVX1-LABEL: shuffle_v8f32_08991abb: ; AVX1: # %bb.0: -; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,0],xmm1[0,0] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] ; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[1,1] ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,2,3,3] @@ -1452,7 +1452,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[0,1,0,1] ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[2,0],ymm1[4,4],ymm0[6,4] ; AVX1-NEXT: retq ; @@ -1719,7 +1719,7 @@ define <8 x i32> @shuffle_v8i32_08991abb(<8 x i32> %a, <8 x i32> %b) { ; AVX1-LABEL: shuffle_v8i32_08991abb: ; AVX1: # %bb.0: -; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,0],xmm1[0,0] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] ; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[1,1] ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,2,3,3] @@ -1757,8 +1757,8 @@ define <8 x i32> @shuffle_v8i32_091b2d3f(<8 x i32> %a, <8 x i32> %b) { ; AVX1-LABEL: shuffle_v8i32_091b2d3f: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -864,12 +864,19 @@ ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_nested_undef_test15: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1] -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_nested_undef_test15: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1] +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_nested_undef_test15: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 +; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1] +; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> ret <4 x i32> %2 @@ -2526,13 +2533,13 @@ define <4 x float> @combine_insertps4(<4 x float> %a, <4 x float> %b) { ; SSE2-LABEL: combine_insertps4: ; SSE2: # %bb.0: -; SSE2-NEXT: 
shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_insertps4: ; SSSE3: # %bb.0: -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; SSSE3-NEXT: retq ; @@ -2743,7 +2750,7 @@ ; SSE2-LABEL: PR30264: ; SSE2: # %bb.0: ; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],mem[2,3] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -2751,7 +2758,7 @@ ; SSSE3-LABEL: PR30264: ; SSSE3: # %bb.0: ; SSSE3-NEXT: xorps %xmm1, %xmm1 -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] +; SSSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],mem[2,3] ; SSSE3-NEXT: movaps %xmm1, %xmm0 ; SSSE3-NEXT: retq @@ -2991,11 +2998,9 @@ ; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSSE3-NEXT: psraw $8, %xmm1 -; SSSE3-NEXT: pextrw $7, %xmm1, %eax -; SSSE3-NEXT: movd %eax, %xmm2 ; SSSE3-NEXT: movsbl (%rsi), %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: palignr {{.*#+}} xmm2 = xmm1[14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; SSSE3-NEXT: movsbl (%rdx), %eax ; SSSE3-NEXT: movd %eax, %xmm0 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] diff --git a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll @@ -1214,22 +1214,55 @@ ; define <4 x float> @var_shuffle_v4f32_v4f32_x0yx_i32(<4 x float> %x, <4 x float> %y, i32 %i0, i32 %i1, i32 %i2, i32 %i3) nounwind { -; SSE-LABEL: var_shuffle_v4f32_v4f32_x0yx_i32: -; SSE: # %bb.0: -; SSE-NEXT: # kill: def $ecx killed $ecx def $rcx -; SSE-NEXT: # kill: def $edx killed $edx def $rdx -; SSE-NEXT: # kill: def $edi killed $edi def $rdi -; SSE-NEXT: andl $3, %edi -; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSE-NEXT: andl $3, %edx -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: andl $3, %ecx -; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: retq +; SSE2-LABEL: var_shuffle_v4f32_v4f32_x0yx_i32: +; SSE2: # %bb.0: +; SSE2-NEXT: # kill: def $ecx killed $ecx def $rcx +; SSE2-NEXT: # kill: def $edx killed $edx def $rdx +; SSE2-NEXT: # kill: def $edi killed $edi def $rdi +; SSE2-NEXT: andl $3, %edi +; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: andl $3, %edx +; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: andl $3, %ecx +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: var_shuffle_v4f32_v4f32_x0yx_i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: # kill: def $ecx killed $ecx def $rcx +; SSSE3-NEXT: # kill: def $edx killed $edx def $rdx +; SSSE3-NEXT: # kill: def $edi killed $edi def $rdi +; SSSE3-NEXT: andl $3, %edi +; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: andl $3, %edx +; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: andl $3, %ecx +; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: var_shuffle_v4f32_v4f32_x0yx_i32: +; SSE41: # %bb.0: +; SSE41-NEXT: # kill: def $ecx killed $ecx def $rcx +; SSE41-NEXT: # kill: def $edx killed $edx def $rdx +; SSE41-NEXT: # kill: def $edi killed $edi def $rdi +; SSE41-NEXT: andl $3, %edi +; SSE41-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE41-NEXT: andl $3, %edx +; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE41-NEXT: andl $3, %ecx +; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],mem[0],zero,zero +; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: retq ; ; AVX-LABEL: var_shuffle_v4f32_v4f32_x0yx_i32: ; AVX: # %bb.0: @@ -1243,8 +1276,7 @@ ; AVX-NEXT: andl $3, %ecx ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],zero,zero ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: retq %x0 = extractelement <4 x float> %x, i32 %i0 @@ -1292,8 +1324,8 @@ ; SSE2-NEXT: movd %eax, %xmm2 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16: @@ -1329,8 +1361,8 @@ ; SSSE3-NEXT: movd %eax, %xmm2 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16: diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll --- a/llvm/test/CodeGen/X86/vector-trunc-math.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll @@ -155,7 +155,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> 
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] @@ -454,7 +454,7 @@ ; AVX1-LABEL: trunc_add_v8i32_v8i16_sext_8i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] @@ -613,7 +613,7 @@ ; AVX1-LABEL: trunc_add_const_v8i32_v8i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -999,7 +999,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] @@ -1425,7 +1425,7 @@ ; AVX1-LABEL: trunc_sub_const_v8i32_v8i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -1914,7 +1914,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] @@ -2273,7 +2273,7 @@ ; AVX1-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] @@ -2433,7 +2433,7 @@ ; AVX1-LABEL: trunc_mul_const_v8i32_v8i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -2899,7 +2899,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -3286,7 +3286,7 @@ ; 
AVX1-LABEL: trunc_and_const_v8i32_v8i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -3662,7 +3662,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -4049,7 +4049,7 @@ ; AVX1-LABEL: trunc_xor_const_v8i32_v8i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -4425,7 +4425,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -4812,7 +4812,7 @@ ; AVX1-LABEL: trunc_or_const_v8i32_v8i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll --- a/llvm/test/CodeGen/X86/vector-trunc.ll +++ b/llvm/test/CodeGen/X86/vector-trunc.ll @@ -116,12 +116,10 @@ ; ; AVX2-FAST-LABEL: trunc8i64_8i32_lshr: ; AVX2-FAST: # %bb.0: # %entry -; AVX2-FAST-NEXT: vpsrlq $32, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpsrlq $32, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm2 = [1,3,5,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc8i64_8i32_lshr: @@ -339,7 +337,7 @@ ; AVX1-LABEL: trunc8i32_8i16: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -2035,13 +2033,14 @@ ; AVX1-LABEL: store_merge_split: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = 
<0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-NEXT: shlq $4, %rdi ; AVX1-NEXT: vmovdqu %xmm0, (%rsi,%rdi) diff --git a/llvm/test/CodeGen/X86/vector-zext.ll b/llvm/test/CodeGen/X86/vector-zext.ll --- a/llvm/test/CodeGen/X86/vector-zext.ll +++ b/llvm/test/CodeGen/X86/vector-zext.ll @@ -1754,7 +1754,7 @@ ; SSE2-LABEL: shuf_zext_16i8_to_4i64_offset11: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero +; SSE2-NEXT: psrlq $8, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; SSE2-NEXT: movdqa %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll --- a/llvm/test/CodeGen/X86/vselect.ll +++ b/llvm/test/CodeGen/X86/vselect.ll @@ -569,7 +569,7 @@ ; SSE2-NEXT: movd %edi, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1] ; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[1,1] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1,3] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[1,1] ; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: pandn %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/vshift-4.ll b/llvm/test/CodeGen/X86/vshift-4.ll --- a/llvm/test/CodeGen/X86/vshift-4.ll +++ b/llvm/test/CodeGen/X86/vshift-4.ll @@ -58,19 +58,15 @@ ; X32-LABEL: shift2a: ; X32: # %bb.0: # %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X32-NEXT: xorps %xmm2, %xmm2 -; X32-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] -; X32-NEXT: pslld %xmm2, %xmm0 +; X32-NEXT: psrlq $32, %xmm1 +; X32-NEXT: pslld %xmm1, %xmm0 ; X32-NEXT: movdqa %xmm0, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: shift2a: ; X64: # %bb.0: # %entry -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X64-NEXT: xorps %xmm2, %xmm2 -; X64-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] -; X64-NEXT: pslld %xmm2, %xmm0 +; X64-NEXT: psrlq $32, %xmm1 +; X64-NEXT: pslld %xmm1, %xmm0 ; X64-NEXT: movdqa %xmm0, (%rdi) ; X64-NEXT: retq entry: @@ -84,19 +80,15 @@ ; X32-LABEL: shift2b: ; X32: # %bb.0: # %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X32-NEXT: xorps %xmm2, %xmm2 -; X32-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] -; X32-NEXT: pslld %xmm2, %xmm0 +; X32-NEXT: psrlq $32, %xmm1 +; X32-NEXT: pslld %xmm1, %xmm0 ; X32-NEXT: movdqa %xmm0, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: shift2b: ; X64: # %bb.0: # %entry -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X64-NEXT: xorps %xmm2, %xmm2 -; X64-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] -; X64-NEXT: pslld %xmm2, %xmm0 +; X64-NEXT: psrlq $32, %xmm1 +; X64-NEXT: pslld %xmm1, %xmm0 ; X64-NEXT: movdqa %xmm0, (%rdi) ; X64-NEXT: retq entry: @@ -110,19 +102,15 @@ ; X32-LABEL: shift2c: ; X32: # %bb.0: # %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; 
X32-NEXT: xorps %xmm2, %xmm2 -; X32-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] -; X32-NEXT: pslld %xmm2, %xmm0 +; X32-NEXT: psrlq $32, %xmm1 +; X32-NEXT: pslld %xmm1, %xmm0 ; X32-NEXT: movdqa %xmm0, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: shift2c: ; X64: # %bb.0: # %entry -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X64-NEXT: xorps %xmm2, %xmm2 -; X64-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] -; X64-NEXT: pslld %xmm2, %xmm0 +; X64-NEXT: psrlq $32, %xmm1 +; X64-NEXT: pslld %xmm1, %xmm0 ; X64-NEXT: movdqa %xmm0, (%rdi) ; X64-NEXT: retq entry: @@ -136,8 +124,7 @@ ; X32-LABEL: shift3a: ; X32: # %bb.0: # %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] -; X32-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; X32-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] ; X32-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X32-NEXT: psllw %xmm1, %xmm0 ; X32-NEXT: movdqa %xmm0, (%eax) @@ -145,8 +132,7 @@ ; ; X64-LABEL: shift3a: ; X64: # %bb.0: # %entry -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] -; X64-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; X64-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] ; X64-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X64-NEXT: psllw %xmm1, %xmm0 ; X64-NEXT: movdqa %xmm0, (%rdi)