Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -29401,13 +29401,13 @@
         (AllowFloatDomain || !Subtarget.hasSSE41())) {
       std::swap(V1, V2);
       Shuffle = X86ISD::MOVSD;
-      SrcVT = DstVT = MaskVT;
+      SrcVT = DstVT = MVT::v2f64;
       return true;
     }
     if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
        (AllowFloatDomain || !Subtarget.hasSSE41())) {
      Shuffle = X86ISD::MOVSS;
-      SrcVT = DstVT = MaskVT;
+      SrcVT = DstVT = MVT::v4f32;
      return true;
    }
  }
@@ -30721,32 +30721,6 @@
     return SDValue();
   }
-  case X86ISD::MOVSD:
-  case X86ISD::MOVSS: {
-    SDValue V0 = peekThroughBitcasts(N->getOperand(0));
-    SDValue V1 = peekThroughBitcasts(N->getOperand(1));
-    bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
-    bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
-    if (isZero0 && isZero1)
-      return SDValue();
-
-    // We often lower to MOVSD/MOVSS from integer as well as native float
-    // types; remove unnecessary domain-crossing bitcasts if we can to make it
-    // easier to combine shuffles later on. We've already accounted for the
-    // domain switching cost when we decided to lower with it.
-    bool isFloat = VT.isFloatingPoint();
-    bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
-    bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
-    if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
-      MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
-                          : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
-      V0 = DAG.getBitcast(NewVT, V0);
-      V1 = DAG.getBitcast(NewVT, V1);
-      return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
-    }
-
-    return SDValue();
-  }
   case X86ISD::INSERTPS: {
     assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
     SDValue Op0 = N.getOperand(0);
Index: lib/Target/X86/X86InstrAVX512.td
===================================================================
--- lib/Target/X86/X86InstrAVX512.td
+++ lib/Target/X86/X86InstrAVX512.td
@@ -4425,14 +4425,6 @@
   def : Pat<(store (f32 (extractelt (v4f32 VR128X:$src), (iPTR 0))),
                    addr:$dst),
             (VMOVSSZmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128X:$src), FR32X))>;
-
-  // Shuffle with VMOVSS
-  def : Pat<(v4i32 (X86Movss VR128X:$src1, VR128X:$src2)),
-            (VMOVSSZrr (v4i32 VR128X:$src1), VR128X:$src2)>;
-
-  // Shuffle with VMOVSD
-  def : Pat<(v2i64 (X86Movsd VR128X:$src1, VR128X:$src2)),
-            (VMOVSDZrr VR128X:$src1, VR128X:$src2)>;
 }
 
 let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -285,14 +285,6 @@
   def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
                    addr:$dst),
             (VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>;
-
-  // Shuffle with VMOVSS
-  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
-            (VMOVSSrr VR128:$src1, VR128:$src2)>;
-
-  // Shuffle with VMOVSD
-  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
-            (VMOVSDrr VR128:$src1, VR128:$src2)>;
 }
 
 let Predicates = [UseSSE1] in {
@@ -317,10 +309,6 @@
   def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
                    addr:$dst),
             (MOVSSmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR32))>;
-
-  // Shuffle with MOVSS
-  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
-            (MOVSSrr VR128:$src1, VR128:$src2)>;
 }
 
 let Predicates = [UseSSE2] in {
@@ -333,10 +321,6 @@
                   (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
   def : Pat<(v2f64 (X86vzload addr:$src)),
             (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
-
-  // Shuffle with MOVSD
-  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
-            (MOVSDrr VR128:$src1, VR128:$src2)>;
 }
 
 // Aliases to help the assembler pick two byte VEX encodings by swapping the
Index: test/CodeGen/X86/oddshuffles.ll
===================================================================
--- test/CodeGen/X86/oddshuffles.ll
+++ test/CodeGen/X86/oddshuffles.ll
@@ -1292,46 +1292,44 @@
 define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, <8 x i32>* %q3) nounwind {
 ; SSE2-LABEL: interleave_24i32_out:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movups 80(%rdi), %xmm5
-; SSE2-NEXT:    movups 64(%rdi), %xmm8
+; SSE2-NEXT:    movups 80(%rdi), %xmm9
+; SSE2-NEXT:    movups 64(%rdi), %xmm10
 ; SSE2-NEXT:    movups (%rdi), %xmm0
-; SSE2-NEXT:    movups 16(%rdi), %xmm6
-; SSE2-NEXT:    movups 32(%rdi), %xmm2
-; SSE2-NEXT:    movups 48(%rdi), %xmm1
-; SSE2-NEXT:    movaps %xmm1, %xmm3
-; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,3],xmm8[2,3]
-; SSE2-NEXT:    movaps %xmm5, %xmm4
-; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[2,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,0]
-; SSE2-NEXT:    movaps %xmm0, %xmm4
-; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,3],xmm6[2,3]
-; SSE2-NEXT:    movaps %xmm2, %xmm7
-; SSE2-NEXT:    shufps {{.*#+}} xmm7 = xmm7[1,0],xmm4[2,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,0]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[2,3,0,1]
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm6[0,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[3,3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm2[0,1,0,3]
-; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm8[0,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm8[3,3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[0,1,0,3]
-; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[2,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,0]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,2,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1]
-; SSE2-NEXT:    movsd {{.*#+}} xmm9 = xmm10[0],xmm9[1]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm8[1,1,2,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
-; SSE2-NEXT:    movsd {{.*#+}} xmm7 = xmm2[0],xmm7[1]
+; SSE2-NEXT:    movups 16(%rdi), %xmm11
+; SSE2-NEXT:    movups 32(%rdi), %xmm8
+; SSE2-NEXT:    movups 48(%rdi), %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm3
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,3],xmm10[2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[2,3,0,1]
+; SSE2-NEXT:    movaps %xmm9, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm10[1,1,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
+; SSE2-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,1],xmm9[0,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm9 = xmm9[1,0],xmm3[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm9[2,0]
+; SSE2-NEXT:    movaps %xmm0, %xmm5
+; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,3],xmm11[2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm11[1,1,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSE2-NEXT:    movaps %xmm8, %xmm4
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[0,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm8 = xmm8[1,0],xmm5[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm11[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm11[3,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,0],xmm10[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm10[3,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,0]
 ; SSE2-NEXT:    movups %xmm3, 16(%rsi)
-; SSE2-NEXT:    movups %xmm4, (%rsi)
-; SSE2-NEXT:    movups %xmm1, 16(%rdx)
+; SSE2-NEXT:    movups %xmm5, (%rsi)
+; SSE2-NEXT:    movups %xmm2, 16(%rdx)
 ; SSE2-NEXT:    movups %xmm0, (%rdx)
-; SSE2-NEXT:    movupd %xmm7, 16(%rcx)
-; SSE2-NEXT:    movupd %xmm9, (%rcx)
+; SSE2-NEXT:    movups %xmm7, 16(%rcx)
+; SSE2-NEXT:    movups %xmm1, (%rcx)
 ; SSE2-NEXT:    retq
 ;
 ; SSE42-LABEL: interleave_24i32_out:
Index: test/CodeGen/X86/vec_shift7.ll
===================================================================
--- test/CodeGen/X86/vec_shift7.ll
+++ test/CodeGen/X86/vec_shift7.ll
@@ -10,7 +10,7 @@
 ; X32-NEXT:    movdqa %xmm0, %xmm1
 ; X32-NEXT:    psllq $2, %xmm1
 ; X32-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; X32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
 ; X32-NEXT:    movd %xmm1, %edx
 ; X32-NEXT:    movd %xmm0, %eax
 ; X32-NEXT:    retl
Index: test/CodeGen/X86/vector-shift-ashr-128.ll
===================================================================
--- test/CodeGen/X86/vector-shift-ashr-128.ll
+++ test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -1235,16 +1235,17 @@
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psraw $4, %xmm1
 ; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
+; SSE2-NEXT:    movapd %xmm1, %xmm2
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
 ; SSE2-NEXT:    psraw $2, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT:    movaps {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    andps %xmm1, %xmm0
 ; SSE2-NEXT:    psraw $1, %xmm2
-; SSE2-NEXT:    pandn %xmm2, %xmm0
-; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    andnps %xmm2, %xmm1
+; SSE2-NEXT:    orps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: constant_shift_v8i16:
@@ -1322,16 +1323,17 @@
 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT:    psraw $4, %xmm1
 ; X32-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
+; X32-SSE-NEXT:    movapd %xmm1, %xmm2
+; X32-SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
 ; X32-SSE-NEXT:    psraw $2, %xmm1
 ; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
-; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
-; X32-SSE-NEXT:    movdqa %xmm2, %xmm1
-; X32-SSE-NEXT:    pand %xmm0, %xmm1
+; X32-SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X32-SSE-NEXT:    movaps {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
+; X32-SSE-NEXT:    movaps %xmm2, %xmm0
+; X32-SSE-NEXT:    andps %xmm1, %xmm0
 ; X32-SSE-NEXT:    psraw $1, %xmm2
-; X32-SSE-NEXT:    pandn %xmm2, %xmm0
-; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    andnps %xmm2, %xmm1
+; X32-SSE-NEXT:    orps %xmm1, %xmm0
 ; X32-SSE-NEXT:    retl
   %shift = ashr <8 x i16> %a, 
   ret <8 x i16> %shift
Index: test/CodeGen/X86/vector-shift-lshr-128.ll
===================================================================
--- test/CodeGen/X86/vector-shift-lshr-128.ll
+++ test/CodeGen/X86/vector-shift-lshr-128.ll
@@ -995,16 +995,17 @@
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
 ; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
+; SSE2-NEXT:    movapd %xmm1, %xmm2
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
 ; SSE2-NEXT:    psrlw $2, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT:    movaps {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    andps %xmm1, %xmm0
 ; SSE2-NEXT:    psrlw $1, %xmm2
-; SSE2-NEXT:    pandn %xmm2, %xmm0
-; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    andnps %xmm2, %xmm1
+; SSE2-NEXT:    orps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: constant_shift_v8i16:
@@ -1083,16 +1084,17 @@
 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT:    psrlw $4, %xmm1
 ; X32-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
+; X32-SSE-NEXT:    movapd %xmm1, %xmm2
+; X32-SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
 ; X32-SSE-NEXT:    psrlw $2, %xmm1
 ; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
-; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
-; X32-SSE-NEXT:    movdqa %xmm2, %xmm1
-; X32-SSE-NEXT:    pand %xmm0, %xmm1
+; X32-SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X32-SSE-NEXT:    movaps {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
+; X32-SSE-NEXT:    movaps %xmm2, %xmm0
+; X32-SSE-NEXT:    andps %xmm1, %xmm0
 ; X32-SSE-NEXT:    psrlw $1, %xmm2
-; X32-SSE-NEXT:    pandn %xmm2, %xmm0
-; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    andnps %xmm2, %xmm1
+; X32-SSE-NEXT:    orps %xmm1, %xmm0
 ; X32-SSE-NEXT:    retl
   %shift = lshr <8 x i16> %a, 
   ret <8 x i16> %shift
Index: test/CodeGen/X86/vector-shuffle-128-v8.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -1248,8 +1248,8 @@
 ; SSE2-LABEL: shuffle_v8i16_032dXXXX:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,0]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,0]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,6,5,6,7]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
 ; SSE2-NEXT:    retq
@@ -1403,8 +1403,8 @@
 ; SSE2-LABEL: shuffle_v8i16_012dcde3:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,2,1]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3,2,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[3,1,2,0,4,5,6,7]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7]
@@ -1542,11 +1542,10 @@
 define <8 x i16> @shuffle_v8i16_XX4X8acX(<8 x i16> %a, <8 x i16> %b) {
 ; SSE2-LABEL: shuffle_v8i16_XX4X8acX:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7]
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,4,7]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,2],xmm1[2,3]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v8i16_XX4X8acX: