Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -29401,13 +29401,13 @@
         (AllowFloatDomain || !Subtarget.hasSSE41())) {
       std::swap(V1, V2);
       Shuffle = X86ISD::MOVSD;
-      SrcVT = DstVT = MaskVT;
+      SrcVT = DstVT = MVT::v2f64;
       return true;
     }
     if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
         (AllowFloatDomain || !Subtarget.hasSSE41())) {
       Shuffle = X86ISD::MOVSS;
-      SrcVT = DstVT = MaskVT;
+      SrcVT = DstVT = MVT::v4f32;
       return true;
     }
   }
@@ -30696,32 +30696,6 @@
     }
     return SDValue();
   }
-  case X86ISD::MOVSD:
-  case X86ISD::MOVSS: {
-    SDValue V0 = peekThroughBitcasts(N->getOperand(0));
-    SDValue V1 = peekThroughBitcasts(N->getOperand(1));
-    bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
-    bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
-    if (isZero0 && isZero1)
-      return SDValue();
-
-    // We often lower to MOVSD/MOVSS from integer as well as native float
-    // types; remove unnecessary domain-crossing bitcasts if we can to make it
-    // easier to combine shuffles later on. We've already accounted for the
-    // domain switching cost when we decided to lower with it.
-    bool isFloat = VT.isFloatingPoint();
-    bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
-    bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
-    if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
-      MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
-                          : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
-      V0 = DAG.getBitcast(NewVT, V0);
-      V1 = DAG.getBitcast(NewVT, V1);
-      return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
-    }
-
-    return SDValue();
-  }
   case X86ISD::INSERTPS: {
     assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
     SDValue Op0 = N.getOperand(0);
Index: lib/Target/X86/X86InstrAVX512.td
===================================================================
--- lib/Target/X86/X86InstrAVX512.td
+++ lib/Target/X86/X86InstrAVX512.td
@@ -4468,16 +4468,6 @@
             (VMOVSSZmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128X:$src),
                                    FR32X))>;
 }
-let Predicates = [HasAVX512, OptForSize] in {
-  // Shuffle with VMOVSS
-  def : Pat<(v4i32 (X86Movss VR128X:$src1, VR128X:$src2)),
-            (VMOVSSZrr (v4i32 VR128X:$src1), VR128X:$src2)>;
-
-  // Shuffle with VMOVSD
-  def : Pat<(v2i64 (X86Movsd VR128X:$src1, VR128X:$src2)),
-            (VMOVSDZrr VR128X:$src1, VR128X:$src2)>;
-}
-
 let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
 def VMOVZPQILo2PQIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst),
                                   (ins VR128X:$src),
Index: lib/Target/X86/X86InstrFragmentsSIMD.td
===================================================================
--- lib/Target/X86/X86InstrFragmentsSIMD.td
+++ lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -281,6 +281,8 @@
 def SDTShuff1Op : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
 def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
                                 SDTCisSameAs<0,2>]>;
+def SDTShuff2OpFP : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisFP<0>,
+                                  SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>;
 
 def SDTShuff2OpM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
                                         SDTCisFP<0>, SDTCisInt<2>,
@@ -368,11 +370,11 @@
 def X86Movshdup : SDNode<"X86ISD::MOVSHDUP", SDTShuff1Op>;
 def X86Movsldup : SDNode<"X86ISD::MOVSLDUP", SDTShuff1Op>;
 
-def X86Movsd : SDNode<"X86ISD::MOVSD", SDTShuff2Op>;
-def X86Movss : SDNode<"X86ISD::MOVSS", SDTShuff2Op>;
+def X86Movsd : SDNode<"X86ISD::MOVSD", SDTShuff2OpFP>;
+def X86Movss : SDNode<"X86ISD::MOVSS", SDTShuff2OpFP>;
 
-def X86Movlhps : SDNode<"X86ISD::MOVLHPS", SDTShuff2Op>;
-def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2Op>;
+def X86Movlhps : SDNode<"X86ISD::MOVLHPS", SDTShuff2OpFP>;
+def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2OpFP>;
 
 def SDTPack : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<0>,
                                    SDTCisVec<1>, SDTCisInt<1>,
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -317,14 +317,6 @@
             (v2i64 (VMOVSDrr (v2i64 (V_SET0)),
                              (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
             sub_xmm)>;
-
-  // Shuffle with VMOVSS
-  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
-            (VMOVSSrr VR128:$src1, VR128:$src2)>;
-
-  // Shuffle with VMOVSD
-  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
-            (VMOVSDrr VR128:$src1, VR128:$src2)>;
 }
 
 let Predicates = [UseSSE1] in {
@@ -335,9 +327,6 @@
             (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
   def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
             (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
-  // Shuffle with MOVSS
-  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
-            (MOVSSrr VR128:$src1, VR128:$src2)>;
 }
 
 // MOVSSrm already zeros the high parts of the register.
@@ -364,12 +353,6 @@
             (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
   def : Pat<(v2f64 (X86vzload addr:$src)),
             (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
-
-  let Predicates = [UseSSE2, NoSSE41_Or_OptForSize] in {
-  // Shuffle with MOVSD
-  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
-            (MOVSDrr VR128:$src1, VR128:$src2)>;
-  }
 }
 
 // Aliases to help the assembler pick two byte VEX encodings by swapping the
@@ -6427,12 +6410,6 @@
             (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
   def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)),
             (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
-  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
-            (VPBLENDWrri VR128:$src1, VR128:$src2, (i8 3))>;
-  def : Pat<(v4i32 (X86Movss VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)))),
-            (VPBLENDWrmi VR128:$src1, addr:$src2, (i8 3))>;
-  def : Pat<(v4i32 (X86Movss (bc_v4i32 (loadv2i64 addr:$src2)), VR128:$src1)),
-            (VPBLENDWrmi VR128:$src1, addr:$src2, (i8 0xfc))>;
 
   def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
             (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
@@ -6440,12 +6417,6 @@
             (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
   def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)),
             (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
-  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
-            (VPBLENDWrri VR128:$src1, VR128:$src2, (i8 0xf))>;
-  def : Pat<(v2i64 (X86Movsd VR128:$src1, (loadv2i64 addr:$src2))),
-            (VPBLENDWrmi VR128:$src1, addr:$src2, (i8 0xf))>;
-  def : Pat<(v2i64 (X86Movsd (loadv2i64 addr:$src2), VR128:$src1)),
-            (VPBLENDWrmi VR128:$src1, addr:$src2, (i8 0xf0))>;
 
   // Move low f32 and clear high bits.
   def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
@@ -6487,12 +6458,6 @@
             (BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
   def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)),
             (BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
-  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
-            (PBLENDWrri VR128:$src1, VR128:$src2, (i8 3))>;
-  def : Pat<(v4i32 (X86Movss VR128:$src1, (bc_v4i32 (memopv2i64 addr:$src2)))),
-            (PBLENDWrmi VR128:$src1, addr:$src2, (i8 3))>;
-  def : Pat<(v4i32 (X86Movss (bc_v4i32 (memopv2i64 addr:$src2)), VR128:$src1)),
-            (PBLENDWrmi VR128:$src1, addr:$src2, (i8 0xfc))>;
 
   def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
             (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
@@ -6500,12 +6465,6 @@
             (BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
   def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)),
             (BLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
-  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
-            (PBLENDWrri VR128:$src1, VR128:$src2, (i8 0xf))>;
-  def : Pat<(v2i64 (X86Movsd VR128:$src1, (memopv2i64 addr:$src2))),
-            (PBLENDWrmi VR128:$src1, addr:$src2, (i8 0xf))>;
-  def : Pat<(v2i64 (X86Movsd (memopv2i64 addr:$src2), VR128:$src1)),
-            (PBLENDWrmi VR128:$src1, addr:$src2, (i8 0xf0))>;
 }
 
Index: test/CodeGen/X86/oddshuffles.ll
===================================================================
--- test/CodeGen/X86/oddshuffles.ll
+++ test/CodeGen/X86/oddshuffles.ll
@@ -1277,46 +1277,44 @@
 define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, <8 x i32>* %q3) nounwind {
 ; SSE2-LABEL: interleave_24i32_out:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movups 80(%rdi), %xmm5
-; SSE2-NEXT:    movups 64(%rdi), %xmm8
+; SSE2-NEXT:    movups 80(%rdi), %xmm9
+; SSE2-NEXT:    movups 64(%rdi), %xmm10
 ; SSE2-NEXT:    movups (%rdi), %xmm0
-; SSE2-NEXT:    movups 16(%rdi), %xmm6
-; SSE2-NEXT:    movups 32(%rdi), %xmm2
-; SSE2-NEXT:    movups 48(%rdi), %xmm1
-; SSE2-NEXT:    movaps %xmm1, %xmm3
-; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,3],xmm8[2,3]
-; SSE2-NEXT:    movaps %xmm5, %xmm4
-; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[2,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,0]
-; SSE2-NEXT:    movaps %xmm0, %xmm4
-; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,3],xmm6[2,3]
-; SSE2-NEXT:    movaps %xmm2, %xmm7
-; SSE2-NEXT:    shufps {{.*#+}} xmm7 = xmm7[1,0],xmm4[2,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,0]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[2,3,0,1]
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm6[0,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[3,3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm2[0,1,0,3]
-; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm8[0,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm8[3,3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[0,1,0,3]
-; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[2,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,0]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,2,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1]
-; SSE2-NEXT:    movsd {{.*#+}} xmm9 = xmm10[0],xmm9[1]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm8[1,1,2,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
-; SSE2-NEXT:    movsd {{.*#+}} xmm7 = xmm2[0],xmm7[1]
+; SSE2-NEXT:    movups 16(%rdi), %xmm11
+; SSE2-NEXT:    movups 32(%rdi), %xmm8
+; SSE2-NEXT:    movups 48(%rdi), %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm3
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,3],xmm10[2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[2,3,0,1]
+; SSE2-NEXT:    movaps %xmm9, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm10[1,1,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
+; SSE2-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,1],xmm9[0,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm9 = xmm9[1,0],xmm3[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm9[2,0]
+; SSE2-NEXT:    movaps %xmm0, %xmm5
+; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,3],xmm11[2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm11[1,1,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSE2-NEXT:    movaps %xmm8, %xmm4
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[0,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm8 = xmm8[1,0],xmm5[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm11[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm11[3,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,0],xmm10[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm10[3,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,0]
 ; SSE2-NEXT:    movups %xmm3, 16(%rsi)
-; SSE2-NEXT:    movups %xmm4, (%rsi)
-; SSE2-NEXT:    movups %xmm1, 16(%rdx)
+; SSE2-NEXT:    movups %xmm5, (%rsi)
+; SSE2-NEXT:    movups %xmm2, 16(%rdx)
 ; SSE2-NEXT:    movups %xmm0, (%rdx)
-; SSE2-NEXT:    movupd %xmm7, 16(%rcx)
-; SSE2-NEXT:    movupd %xmm9, (%rcx)
+; SSE2-NEXT:    movups %xmm7, 16(%rcx)
+; SSE2-NEXT:    movups %xmm1, (%rcx)
 ; SSE2-NEXT:    retq
 ;
 ; SSE42-LABEL: interleave_24i32_out:
Index: test/CodeGen/X86/vector-shift-ashr-128.ll
===================================================================
--- test/CodeGen/X86/vector-shift-ashr-128.ll
+++ test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -1235,16 +1236,17 @@
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psraw $4, %xmm1
 ; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
+; SSE2-NEXT:    movapd %xmm1, %xmm2
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
 ; SSE2-NEXT:    psraw $2, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT:    movaps {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    andps %xmm1, %xmm0
 ; SSE2-NEXT:    psraw $1, %xmm2
-; SSE2-NEXT:    pandn %xmm2, %xmm0
-; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    andnps %xmm2, %xmm1
+; SSE2-NEXT:    orps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: constant_shift_v8i16:
@@ -1322,16 +1323,17 @@
 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT:    psraw $4, %xmm1
 ; X32-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
+; X32-SSE-NEXT:    movapd %xmm1, %xmm2
+; X32-SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
 ; X32-SSE-NEXT:    psraw $2, %xmm1
 ; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
-; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
-; X32-SSE-NEXT:    movdqa %xmm2, %xmm1
-; X32-SSE-NEXT:    pand %xmm0, %xmm1
+; X32-SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X32-SSE-NEXT:    movaps {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
+; X32-SSE-NEXT:    movaps %xmm2, %xmm0
+; X32-SSE-NEXT:    andps %xmm1, %xmm0
 ; X32-SSE-NEXT:    psraw $1, %xmm2
-; X32-SSE-NEXT:    pandn %xmm2, %xmm0
-; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    andnps %xmm2, %xmm1
+; X32-SSE-NEXT:    orps %xmm1, %xmm0
 ; X32-SSE-NEXT:    retl
   %shift = ashr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
   ret <8 x i16> %shift
Index: test/CodeGen/X86/vector-shift-lshr-128.ll
===================================================================
--- test/CodeGen/X86/vector-shift-lshr-128.ll
+++ test/CodeGen/X86/vector-shift-lshr-128.ll
@@ -995,16 +996,17 @@
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
 ; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
+; SSE2-NEXT:    movapd %xmm1, %xmm2
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
 ; SSE2-NEXT:    psrlw $2, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT:    movaps {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    andps %xmm1, %xmm0
 ; SSE2-NEXT:    psrlw $1, %xmm2
-; SSE2-NEXT:    pandn %xmm2, %xmm0
-; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    andnps %xmm2, %xmm1
+; SSE2-NEXT:    orps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: constant_shift_v8i16:
@@ -1083,16 +1084,17 @@
 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT:    psrlw $4, %xmm1
 ; X32-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
+; X32-SSE-NEXT:    movapd %xmm1, %xmm2
+; X32-SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
 ; X32-SSE-NEXT:    psrlw $2, %xmm1
 ; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
-; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
-; X32-SSE-NEXT:    movdqa %xmm2, %xmm1
-; X32-SSE-NEXT:    pand %xmm0, %xmm1
+; X32-SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X32-SSE-NEXT:    movaps {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
+; X32-SSE-NEXT:    movaps %xmm2, %xmm0
+; X32-SSE-NEXT:    andps %xmm1, %xmm0
 ; X32-SSE-NEXT:    psrlw $1, %xmm2
-; X32-SSE-NEXT:    pandn %xmm2, %xmm0
-; X32-SSE-NEXT:    por %xmm1, %xmm0
+; X32-SSE-NEXT:    andnps %xmm2, %xmm1
+; X32-SSE-NEXT:    orps %xmm1, %xmm0
 ; X32-SSE-NEXT:    retl
   %shift = lshr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
   ret <8 x i16> %shift
Index: test/CodeGen/X86/vector-shuffle-128-v8.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -1248,8 +1248,8 @@
 ; SSE2-LABEL: shuffle_v8i16_032dXXXX:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,0]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,0]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,6,5,6,7]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
 ; SSE2-NEXT:    retq
@@ -1403,8 +1403,8 @@
 ; SSE2-LABEL: shuffle_v8i16_012dcde3:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,2,1]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3,2,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[3,1,2,0,4,5,6,7]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7]
@@ -1542,11 +1542,10 @@
 define <8 x i16> @shuffle_v8i16_XX4X8acX(<8 x i16> %a, <8 x i16> %b) {
 ; SSE2-LABEL: shuffle_v8i16_XX4X8acX:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7]
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,4,7]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,2],xmm1[2,3]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v8i16_XX4X8acX: