Index: lib/Target/X86/InstPrinter/X86InstComments.cpp
===================================================================
--- lib/Target/X86/InstPrinter/X86InstComments.cpp
+++ lib/Target/X86/InstPrinter/X86InstComments.cpp
@@ -203,6 +203,24 @@
     DecodeMOVSHDUPMask(MVT::v4f32, ShuffleMask);
     break;
 
+  case X86::VMOVDDUPYrr:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::VMOVDDUPYrm:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeMOVDDUPMask(MVT::v4f64, ShuffleMask);
+    break;
+
+  case X86::MOVDDUPrr:
+  case X86::VMOVDDUPrr:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::MOVDDUPrm:
+  case X86::VMOVDDUPrm:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeMOVDDUPMask(MVT::v2f64, ShuffleMask);
+    break;
+
   case X86::PSLLDQri:
   case X86::VPSLLDQri:
     Src1Name = getRegName(MI->getOperand(1).getReg());
Index: lib/Target/X86/Utils/X86ShuffleDecode.h
===================================================================
--- lib/Target/X86/Utils/X86ShuffleDecode.h
+++ lib/Target/X86/Utils/X86ShuffleDecode.h
@@ -40,6 +40,8 @@
 
 void DecodeMOVSHDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
 
+void DecodeMOVDDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
 void DecodePSLLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
 
 void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
Index: lib/Target/X86/Utils/X86ShuffleDecode.cpp
===================================================================
--- lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -79,6 +79,20 @@
   }
 }
 
+void DecodeMOVDDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+  unsigned VectorSizeInBits = VT.getSizeInBits();
+  unsigned ScalarSizeInBits = VT.getScalarSizeInBits();
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumLanes = VectorSizeInBits / 128;
+  unsigned NumLaneElts = NumElts / NumLanes;
+  unsigned NumLaneSubElts = 64 / ScalarSizeInBits;
+
+  for (unsigned l = 0; l < NumElts; l += NumLaneElts)
+    for (unsigned i = 0; i < NumLaneElts; i += NumLaneSubElts)
+      for (unsigned s = 0; s != NumLaneSubElts; s++)
+        ShuffleMask.push_back(l + s);
+}
+
 void DecodePSLLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
   unsigned VectorSizeInBits = VT.getSizeInBits();
   unsigned NumElts = VectorSizeInBits / 8;
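
The new DecodeMOVDDUPMask duplicates the low 64 bits of each 128-bit lane. As a quick sanity check outside of LLVM, the same loop can be run with plain integers standing in for MVT and SmallVectorImpl; the helper name movddupMask below is invented for this sketch and is not part of the patch:

  #include <cstdio>
  #include <vector>

  // Standalone copy of the DecodeMOVDDUPMask loop: for each 128-bit lane,
  // emit the indices of the low 64 bits once per 64-bit chunk of the lane.
  static std::vector<unsigned> movddupMask(unsigned VectorSizeInBits,
                                           unsigned ScalarSizeInBits) {
    unsigned NumElts = VectorSizeInBits / ScalarSizeInBits;
    unsigned NumLanes = VectorSizeInBits / 128;
    unsigned NumLaneElts = NumElts / NumLanes;
    unsigned NumLaneSubElts = 64 / ScalarSizeInBits;

    std::vector<unsigned> Mask;
    for (unsigned l = 0; l < NumElts; l += NumLaneElts)
      for (unsigned i = 0; i < NumLaneElts; i += NumLaneSubElts)
        for (unsigned s = 0; s != NumLaneSubElts; s++)
          Mask.push_back(l + s);
    return Mask;
  }

  int main() {
    const unsigned Sizes[2] = {128, 256}; // v2f64 and v4f64
    for (unsigned Bits : Sizes) {
      std::printf("%u-bit f64:", Bits);
      for (unsigned M : movddupMask(Bits, 64))
        std::printf(" %u", M);
      std::printf("\n"); // 128-bit: 0 0    256-bit: 0 0 2 2
    }
    return 0;
  }

For v2f64 this produces the mask 0,0 and for v4f64 it produces 0,0,2,2, which is exactly the xmm = xmm[0,0] / ymm = ymm[0,0,2,2] text the instruction-comment hook above prints and the updated CHECK lines below expect.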
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -5501,11 +5501,16 @@
     break;
   case X86ISD::MOVSLDUP:
     DecodeMOVSLDUPMask(VT, Mask);
+    IsUnary = true;
     break;
   case X86ISD::MOVSHDUP:
     DecodeMOVSHDUPMask(VT, Mask);
+    IsUnary = true;
     break;
   case X86ISD::MOVDDUP:
+    DecodeMOVDDUPMask(VT, Mask);
+    IsUnary = true;
+    break;
   case X86ISD::MOVLHPD:
   case X86ISD::MOVLPD:
   case X86ISD::MOVLPS:
@@ -8254,6 +8259,11 @@
   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
 
   if (isSingleInputShuffleMask(Mask)) {
+    // Use low duplicate instructions for masks that match their pattern.
+    if (Subtarget->hasSSE3())
+      if (isShuffleEquivalent(Mask, 0, 0))
+        return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, V1);
+
     // Straight shuffle of a single input vector. Simulate this by using the
     // single input as both of the "inputs" to this instruction..
     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
@@ -8511,6 +8521,14 @@
                                                     Mask, Subtarget, DAG))
       return Broadcast;
 
+  // Use even/odd duplicate instructions for masks that match their pattern.
+  if (Subtarget->hasSSE3()) {
+    if (isShuffleEquivalent(Mask, 0, 0, 2, 2))
+      return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
+    if (isShuffleEquivalent(Mask, 1, 1, 3, 3))
+      return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
+  }
+
   if (Subtarget->hasAVX()) {
     // If we have AVX, we can use VPERMILPS which will allow folding a load
     // into the shuffle.
@@ -10236,6 +10254,10 @@
                                                     Mask, Subtarget, DAG))
       return Broadcast;
 
+  // Use low duplicate instructions for masks that match their pattern.
+  if (isShuffleEquivalent(Mask, 0, 0, 2, 2))
+    return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
+
   if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
     // Non-half-crossing single input shuffles can be lowerid with an
     // interleaved permutation.
@@ -10419,6 +10441,13 @@
   if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
     assert(RepeatedMask.size() == 4 &&
            "Repeated masks must be half the mask width!");
+
+    // Use even/odd duplicate instructions for masks that match their pattern.
+    if (isShuffleEquivalent(Mask, 0, 0, 2, 2, 4, 4, 6, 6))
+      return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
+    if (isShuffleEquivalent(Mask, 1, 1, 3, 3, 5, 5, 7, 7))
+      return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
+
     if (isSingleInputShuffleMask(Mask))
       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
                          getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
@@ -22808,7 +22837,8 @@
                                          : InVec.getOperand(1);
 
   // If inputs to shuffle are the same for both ops, then allow 2 uses
-  unsigned AllowedUses = InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
+  unsigned AllowedUses = InVec.getNumOperands() > 1 &&
+                         InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
 
   if (LdNode.getOpcode() == ISD::BITCAST) {
     // Don't duplicate a load with other uses.
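
The lowering hunks above all follow the same shape: when a single-input shuffle mask matches the duplicate form, emit MOVDDUP, MOVSLDUP or MOVSHDUP instead of a generic shuffle. The snippet below is only an illustration of that mask check, not LLVM's helper; maskMatches and classifyV4F32 are invented names, and treating -1 (undef) entries as wildcards mirrors my reading of isShuffleEquivalent:

  #include <cstdio>
  #include <initializer_list>
  #include <vector>

  // Compare a shuffle mask against a reference pattern, treating -1 (undef)
  // entries as "match anything" -- the same idea as the
  // isShuffleEquivalent(Mask, ...) calls in the hunks above.
  static bool maskMatches(const std::vector<int> &Mask,
                          std::initializer_list<int> Ref) {
    if (Mask.size() != Ref.size())
      return false;
    unsigned i = 0;
    for (int R : Ref) {
      if (Mask[i] >= 0 && Mask[i] != R)
        return false;
      ++i;
    }
    return true;
  }

  // Mirror of the v4f32 case: even elements -> movsldup, odd -> movshdup.
  // The v2f64 case checks {0, 0} for movddup, and the 256-bit cases check
  // the same patterns repeated per 128-bit lane.
  static const char *classifyV4F32(const std::vector<int> &Mask) {
    if (maskMatches(Mask, {0, 0, 2, 2}))
      return "movsldup";
    if (maskMatches(Mask, {1, 1, 3, 3}))
      return "movshdup";
    return "some other shuffle";
  }

  int main() {
    std::printf("%s\n", classifyV4F32({0, 0, 2, 2}));  // movsldup
    std::printf("%s\n", classifyV4F32({1, -1, 3, 3})); // movshdup (undef ok)
    std::printf("%s\n", classifyV4F32({0, 1, 2, 3}));  // some other shuffle
  }

The AllowedUses change in the last hunk looks like a consequence of the same lowering: the duplicate nodes are unary, so InVec.getOperand(1) must not be read before checking InVec.getNumOperands() > 1.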
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -1333,7 +1333,7 @@
                     (VMOVHPSrm VR128:$src1, addr:$src2)>;
 
   // VMOVHPD patterns
-  
+
   // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem
   // is during lowering, where it's not possible to recognize the load fold
   // cause it has two uses through a bitcast. One use disappears at isel time
@@ -2743,24 +2743,6 @@
             (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
 }
 
-let Predicates = [HasAVX] in {
-  // FIXME: Instead of X86Movddup, there should be a X86Unpckl here, the
-  // problem is during lowering, where it's not possible to recognize the load
-  // fold cause it has two uses through a bitcast. One use disappears at isel
-  // time and the fold opportunity reappears.
-  def : Pat<(v2f64 (X86Movddup VR128:$src)),
-            (VUNPCKLPDrr VR128:$src, VR128:$src)>;
-}
-
-let Predicates = [UseSSE2] in {
-  // FIXME: Instead of X86Movddup, there should be a X86Unpckl here, the
-  // problem is during lowering, where it's not possible to recognize the load
-  // fold cause it has two uses through a bitcast. One use disappears at isel
-  // time and the fold opportunity reappears.
-  def : Pat<(v2f64 (X86Movddup VR128:$src)),
-            (UNPCKLPDrr VR128:$src, VR128:$src)>;
-}
-
 //===----------------------------------------------------------------------===//
 // SSE 1 & 2 - Extract Floating-Point Sign mask
 //===----------------------------------------------------------------------===//
@@ -5388,10 +5370,10 @@
 //===---------------------------------------------------------------------===//
 
 multiclass sse3_replicate_dfp<string OpcodeStr> {
-let hasSideEffects = 0 in
 def rr  : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-                    [], IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
+                    [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))],
+                    IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
 def rm  : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set VR128:$dst,
@@ -8130,21 +8112,21 @@
                        (loadv2f64 (add addr:$src, (iPTR 16))),
                        (iPTR 2)),
             (VMOVUPDYrm addr:$src)>;
-  
+
   def : Pat<(insert_subvector
               (v32i8 (insert_subvector undef, (bc_v16i8 (loadv2i64 addr:$src)),
                                                (iPTR 0))),
              (bc_v16i8 (loadv2i64 (add addr:$src, (iPTR 16)))),
              (iPTR 16)),
            (VMOVDQUYrm addr:$src)>;
-  
+
  def : Pat<(insert_subvector
              (v16i16 (insert_subvector undef, (bc_v8i16 (loadv2i64 addr:$src)),
                                               (iPTR 0))),
             (bc_v8i16 (loadv2i64 (add addr:$src, (iPTR 16)))),
             (iPTR 8)),
            (VMOVDQUYrm addr:$src)>;
-  
+
  def : Pat<(insert_subvector
              (v8i32 (insert_subvector undef, (bc_v4i32 (loadv2i64 addr:$src)),
                                              (iPTR 0))),
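
Together with the lowering above, giving the rr variant of sse3_replicate_dfp a real X86Movddup pattern (and dropping the two workaround Pat<>s that mapped X86Movddup onto (V)UNPCKLPDrr) is what turns the unpcklpd splats in the tests below into movddup. As a reminder of the instruction semantics, here is a scalar model of one 128-bit lane of the three duplicate instructions; this is a reference sketch only, not code from the patch:

  #include <cstdio>

  static void movddup(const double In[2], double Out[2]) {   // low f64 twice
    Out[0] = In[0]; Out[1] = In[0];
  }
  static void movsldup(const float In[4], float Out[4]) {    // even f32s twice
    Out[0] = In[0]; Out[1] = In[0]; Out[2] = In[2]; Out[3] = In[2];
  }
  static void movshdup(const float In[4], float Out[4]) {    // odd f32s twice
    Out[0] = In[1]; Out[1] = In[1]; Out[2] = In[3]; Out[3] = In[3];
  }

  int main() {
    double D[2] = {1.0, 2.0}, DD[2];
    float F[4] = {1, 2, 3, 4}, FL[4], FH[4];
    movddup(D, DD);    // {1, 1}       == mask [0,0]
    movsldup(F, FL);   // {1, 1, 3, 3} == mask [0,0,2,2]
    movshdup(F, FH);   // {2, 2, 4, 4} == mask [1,1,3,3]
    std::printf("%g %g | %g %g %g %g | %g %g %g %g\n", DD[0], DD[1],
                FL[0], FL[1], FL[2], FL[3], FH[0], FH[1], FH[2], FH[3]);
  }

On wider vectors the same duplication is applied independently to each 128-bit lane, which is why the 256-bit CHECK lines below read ymm[0,0,2,2] and ymm[1,1,3,3,5,5,7,7].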
Index: test/CodeGen/X86/avx-splat.ll
===================================================================
--- test/CodeGen/X86/avx-splat.ll
+++ test/CodeGen/X86/avx-splat.ll
@@ -18,7 +18,7 @@
 }
 
 ; CHECK: vmovq
-; CHECK-NEXT: vunpcklpd %xmm
+; CHECK-NEXT: vmovddup %xmm
 ; CHECK-NEXT: vinsertf128 $1
 define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {
 entry:
@@ -29,7 +29,7 @@
   ret <4 x i64> %vecinit6.i
 }
 
-; CHECK: vunpcklpd %xmm
+; CHECK: vmovddup %xmm
 ; CHECK-NEXT: vinsertf128 $1
 define <4 x double> @funcD(double %q) nounwind uwtable readnone ssp {
 entry:
@@ -42,7 +42,7 @@
 
 ; Test this turns into a broadcast:
 ;   shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
-; 
+;
 ; CHECK: vbroadcastss
 define <8 x float> @funcE() nounwind {
 allocas:
Index: test/CodeGen/X86/avx2-vbroadcast.ll
===================================================================
--- test/CodeGen/X86/avx2-vbroadcast.ll
+++ test/CodeGen/X86/avx2-vbroadcast.ll
@@ -317,7 +317,7 @@
 }
 
 ;CHECK-LABEL: _inreg2xdouble:
-;CHECK: vunpcklpd
+;CHECK: vmovddup
 ;CHECK: ret
 define <2 x double> @_inreg2xdouble(<2 x double> %a) {
   %b = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> zeroinitializer
Index: test/CodeGen/X86/sincos-opt.ll
===================================================================
--- test/CodeGen/X86/sincos-opt.ll
+++ test/CodeGen/X86/sincos-opt.ll
@@ -15,9 +15,8 @@
 
 ; OSX_SINCOS-LABEL: test1:
 ; OSX_SINCOS: callq ___sincosf_stret
-; OSX_SINCOS: movaps %xmm0, %xmm1
-; OSX_SINCOS: shufps {{.*}} ## xmm1 = xmm1[1,1,2,3]
-; OSX_SINCOS: addss %xmm0, %xmm1
+; OSX_SINCOS: movshdup {{.*}} xmm1 = xmm0[1,1,3,3]
+; OSX_SINCOS: addss %xmm1, %xmm0
 
 ; OSX_NOOPT: test1
 ; OSX_NOOPT: callq _sinf
Index: test/CodeGen/X86/sse41.ll
===================================================================
--- test/CodeGen/X86/sse41.ll
+++ test/CodeGen/X86/sse41.ll
@@ -291,22 +291,20 @@
 define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind {
 ; X32-LABEL: buildvector:
 ; X32:       ## BB#0: ## %entry
-; X32-NEXT:    movaps %xmm0, %xmm2
-; X32-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; X32-NEXT:    movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X32-NEXT:    movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
 ; X32-NEXT:    addss %xmm1, %xmm0
-; X32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; X32-NEXT:    addss %xmm2, %xmm1
-; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; X32-NEXT:    addss %xmm2, %xmm3
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: buildvector:
 ; X64:       ## BB#0: ## %entry
-; X64-NEXT:    movaps %xmm0, %xmm2
-; X64-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; X64-NEXT:    movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X64-NEXT:    movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
 ; X64-NEXT:    addss %xmm1, %xmm0
-; X64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; X64-NEXT:    addss %xmm2, %xmm1
-; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; X64-NEXT:    addss %xmm2, %xmm3
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
 ; X64-NEXT:    retq
 entry:
   %tmp7 = extractelement <2 x float> %A, i32 0
Index: test/CodeGen/X86/sse_partial_update.ll
===================================================================
--- test/CodeGen/X86/sse_partial_update.ll
+++ test/CodeGen/X86/sse_partial_update.ll
@@ -12,7 +12,7 @@
 ; CHECK-LABEL: rsqrtss:
 ; CHECK: rsqrtss %xmm0, %xmm0
 ; CHECK-NEXT: cvtss2sd %xmm0
-; CHECK-NEXT: shufps
+; CHECK-NEXT: movshdup
 ; CHECK-NEXT: cvtss2sd %xmm0
 ; CHECK-NEXT: movap
 ; CHECK-NEXT: jmp
@@ -33,7 +33,7 @@
 ; CHECK-LABEL: rcpss:
 ; CHECK: rcpss %xmm0, %xmm0
 ; CHECK-NEXT: cvtss2sd %xmm0
-; CHECK-NEXT: shufps
+; CHECK-NEXT: movshdup
 ; CHECK-NEXT: cvtss2sd %xmm0
 ; CHECK-NEXT: movap
 ; CHECK-NEXT: jmp
@@ -53,7 +53,7 @@
 ; CHECK-LABEL: sqrtss:
 ; CHECK: sqrtss %xmm0, %xmm0
 ; CHECK-NEXT: cvtss2sd %xmm0
-; CHECK-NEXT: shufps
+; CHECK-NEXT: movshdup
 ; CHECK-NEXT: cvtss2sd %xmm0
 ; CHECK-NEXT: movap
 ; CHECK-NEXT: jmp
Index: test/CodeGen/X86/v2f32.ll
===================================================================
--- test/CodeGen/X86/v2f32.ll
+++ test/CodeGen/X86/v2f32.ll
@@ -5,8 +5,7 @@
 define void @test1(<2 x float> %Q, float *%P2) nounwind {
 ; X64-LABEL: test1:
 ; X64:       # BB#0:
-; X64-NEXT:    movaps %xmm0, %xmm1
-; X64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; X64-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; X64-NEXT:    addss %xmm0, %xmm1
 ; X64-NEXT:    movss %xmm1, (%rdi)
 ; X64-NEXT:    retq
@@ -14,8 +13,7 @@
 ; X32-LABEL: test1:
 ; X32:       # BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movaps %xmm0, %xmm1
-; X32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; X32-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; X32-NEXT:    addss %xmm0, %xmm1
 ; X32-NEXT:    movss %xmm1, (%eax)
 ; X32-NEXT:    retl
Index: test/CodeGen/X86/vec_cast2.ll
===================================================================
--- test/CodeGen/X86/vec_cast2.ll
+++ test/CodeGen/X86/vec_cast2.ll
@@ -118,7 +118,7 @@
 ; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %ecx
 ; CHECK-WIDE-NEXT:    movzbl %cl, %ecx
 ; CHECK-WIDE-NEXT:    orl %eax, %ecx
-; CHECK-WIDE-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; CHECK-WIDE-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %eax
 ; CHECK-WIDE-NEXT:    shll $8, %eax
 ; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %edx
@@ -127,7 +127,7 @@
 ; CHECK-WIDE-NEXT:    vpinsrw $0, %edx, %xmm0, %xmm1
 ; CHECK-WIDE-NEXT:    vpinsrw $1, %ecx, %xmm1, %xmm1
 ; CHECK-WIDE-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; CHECK-WIDE-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; CHECK-WIDE-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
 ; CHECK-WIDE-NEXT:    shll $8, %eax
 ; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %ecx
@@ -163,7 +163,7 @@
 ; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %ecx
 ; CHECK-WIDE-NEXT:    movzbl %cl, %ecx
 ; CHECK-WIDE-NEXT:    orl %eax, %ecx
-; CHECK-WIDE-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; CHECK-WIDE-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %eax
 ; CHECK-WIDE-NEXT:    shll $8, %eax
 ; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %edx
Index: test/CodeGen/X86/vector-shuffle-128-v2.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -105,22 +105,22 @@
 ;
 ; SSE3-LABEL: shuffle_v2f64_00:
 ; SSE3:       # BB#0:
-; SSE3-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v2f64_00:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v2f64_00:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v2f64_00:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 0>
   ret <2 x double> %shuffle
@@ -160,25 +160,22 @@
 ;
 ; SSE3-LABEL: shuffle_v2f64_22:
 ; SSE3:       # BB#0:
-; SSE3-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0,0]
-; SSE3-NEXT:    movapd %xmm1, %xmm0
+; SSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v2f64_22:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0,0]
-; SSSE3-NEXT:    movapd %xmm1, %xmm0
+; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v2f64_22:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0,0]
-; SSE41-NEXT:    movapd %xmm1, %xmm0
+; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v2f64_22:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0,0]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm1[0,0]
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 2, i32 2>
   ret <2 x double> %shuffle
@@ -1064,22 +1061,22 @@
 ;
 ; SSE3-LABEL: insert_dup_reg_v2f64:
 ; SSE3:       # BB#0:
-; SSE3-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: insert_dup_reg_v2f64:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: insert_dup_reg_v2f64:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: insert_dup_reg_v2f64:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX-NEXT:    retq
   %v = insertelement <2 x double> undef, double %a, i32 0
   %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 0, i32 0>
Index: test/CodeGen/X86/vector-shuffle-256-v4.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -6,7 +6,7 @@
 define <4 x double> @shuffle_v4f64_0000(<4 x double> %a, <4 x double> %b) {
 ; AVX1-LABEL: shuffle_v4f64_0000:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -21,7 +21,7 @@
 define <4 x double> @shuffle_v4f64_0001(<4 x double> %a, <4 x double> %b) {
 ; AVX1-LABEL: shuffle_v4f64_0001:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -38,7 +38,7 @@
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -70,7 +70,7 @@
 ; AVX1-LABEL: shuffle_v4f64_1000:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -86,7 +86,7 @@
 ; AVX1-LABEL: shuffle_v4f64_2200:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v4f64_2200:
@@ -141,7 +141,7 @@
 define <4 x double> @shuffle_v4f64_0022(<4 x double> %a, <4 x double> %b) {
 ; ALL-LABEL: shuffle_v4f64_0022:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; ALL-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
   ret <4 x double> %shuffle
@@ -186,7 +186,7 @@
 define <4 x double> @shuffle_v4f64_0423(<4 x double> %a, <4 x double> %b) {
 ; AVX1-LABEL: shuffle_v4f64_0423:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; AVX1-NEXT:    vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
 ; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
 ; AVX1-NEXT:    retq
 ;
@@ -202,8 +202,8 @@
 define <4 x double> @shuffle_v4f64_0462(<4 x double> %a, <4 x double> %b) {
 ; ALL-LABEL: shuffle_v4f64_0462:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[0,0,2,2]
-; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; ALL-NEXT:    vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; ALL-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
 ; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 6, i32 2>
@@ -361,7 +361,7 @@
 define <4 x i64> @shuffle_v4i64_0000(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: shuffle_v4i64_0000:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -376,7 +376,7 @@
 define <4 x i64> @shuffle_v4i64_0001(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: shuffle_v4i64_0001:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -393,7 +393,7 @@
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -441,7 +441,7 @@
 ; AVX1-LABEL: shuffle_v4i64_1000:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -457,7 +457,7 @@
 ; AVX1-LABEL: shuffle_v4i64_2200:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v4i64_2200:
@@ -503,7 +503,7 @@
 define <4 x i64> @shuffle_v4i64_0124(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: shuffle_v4i64_0124:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX1-NEXT:    retq
@@ -541,7 +541,7 @@
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1],xmm2[0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; AVX1-NEXT:    vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
 ; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
 ; AVX1-NEXT:    retq
 ;
@@ -560,7 +560,7 @@
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1],xmm2[0]
-; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
 ; AVX1-NEXT:    retq
@@ -875,7 +875,7 @@
 define <4 x double> @splat_v4f64(<2 x double> %r) {
 ; AVX1-LABEL: splat_v4f64:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
Index: test/CodeGen/X86/vector-shuffle-256-v8.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -148,7 +148,7 @@
 define <8 x float> @shuffle_v8f32_01014545(<8 x float> %a, <8 x float> %b) {
 ; ALL-LABEL: shuffle_v8f32_01014545:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; ALL-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
   ret <8 x float> %shuffle
@@ -202,7 +202,7 @@
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,2,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
 ; AVX1-NEXT:    retq
@@ -336,7 +336,7 @@
 define <8 x float> @shuffle_v8f32_09ab1def(<8 x float> %a, <8 x float> %b) {
 ; AVX1-LABEL: shuffle_v8f32_09ab1def:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
 ; AVX1-NEXT:    retq
@@ -426,7 +426,7 @@
 define <8 x float> @shuffle_v8f32_00224466(<8 x float> %a, <8 x float> %b) {
 ; ALL-LABEL: shuffle_v8f32_00224466:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
+; ALL-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
   ret <8 x float> %shuffle
@@ -444,7 +444,7 @@
 define <8 x float> @shuffle_v8f32_11335577(<8 x float> %a, <8 x float> %b) {
 ; ALL-LABEL: shuffle_v8f32_11335577:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; ALL-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
   ret <8 x float> %shuffle
@@ -940,7 +940,7 @@
 define <8 x i32> @shuffle_v8i32_01014545(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_01014545:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v8i32_01014545:
@@ -1004,7 +1004,7 @@
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,2,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
 ; AVX1-NEXT:    retq
@@ -1175,7 +1175,7 @@
 define <8 x i32> @shuffle_v8i32_09ab1def(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_09ab1def:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
 ; AVX1-NEXT:    retq
@@ -1305,7 +1305,7 @@
 define <8 x i32> @shuffle_v8i32_00224466(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_00224466:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
+; AVX1-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v8i32_00224466:
@@ -1333,7 +1333,7 @@
 define <8 x i32> @shuffle_v8i32_11335577(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_11335577:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; AVX1-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v8i32_11335577:
Index: test/CodeGen/X86/vector-shuffle-512-v8.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -407,9 +407,9 @@
 define <8 x double> @shuffle_v8f64_00224466(<8 x double> %a, <8 x double> %b) {
 ; ALL-LABEL: shuffle_v8f64_00224466:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,2]
+; ALL-NEXT:    vmovddup {{.*#+}} ymm1 = ymm0[0,0,2,2]
 ; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
-; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; ALL-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
 ; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
@@ -562,7 +562,7 @@
 define <8 x double> @shuffle_v8f64_00226644(<8 x double> %a, <8 x double> %b) {
 ; ALL-LABEL: shuffle_v8f64_00226644:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,2]
+; ALL-NEXT:    vmovddup {{.*#+}} ymm1 = ymm0[0,0,2,2]
 ; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
 ; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0]
 ; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
@@ -618,7 +618,7 @@
 define <8 x double> @shuffle_v8f64_002u6u44(<8 x double> %a, <8 x double> %b) {
 ; ALL-LABEL: shuffle_v8f64_002u6u44:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,2]
+; ALL-NEXT:    vmovddup {{.*#+}} ymm1 = ymm0[0,0,2,2]
 ; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
 ; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,0,0]
 ; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
@@ -676,7 +676,7 @@
 ; ALL-LABEL: shuffle_v8f64_uuu3uu66:
 ; ALL:       # BB#0:
 ; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
-; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; ALL-NEXT:    vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
 ; ALL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6>
@@ -708,7 +708,7 @@
 ; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm3
 ; ALL-NEXT:    vpermpd {{.*#+}} ymm4 = ymm3[0,1,1,3]
 ; ALL-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3]
-; ALL-NEXT:    vpermilpd {{.*#+}} ymm4 = ymm1[0,0,2,2]
+; ALL-NEXT:    vmovddup {{.*#+}} ymm4 = ymm1[0,0,2,2]
 ; ALL-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3]
 ; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,1]
 ; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3]
Index: test/CodeGen/X86/vector-shuffle-combining.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-combining.ll
+++ test/CodeGen/X86/vector-shuffle-combining.ll
@@ -1644,19 +1644,17 @@
 ;
 ; SSSE3-LABEL: combine_test2b:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0,0]
-; SSSE3-NEXT:    movapd %xmm1, %xmm0
+; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_test2b:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0,0]
-; SSE41-NEXT:    movapd %xmm1, %xmm0
+; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: combine_test2b:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0,0]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm1[0,0]
 ; AVX-NEXT:    retq
   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32>
   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32>
@@ -2181,17 +2179,17 @@
 ;
 ; SSSE3-LABEL: combine_undef_input_test7:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_undef_input_test7:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: combine_undef_input_test7:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX-NEXT:    retq
   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32>
   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32>
@@ -2206,17 +2204,17 @@
 ;
 ; SSSE3-LABEL: combine_undef_input_test8:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_undef_input_test8:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: combine_undef_input_test8:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX-NEXT:    retq
   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32>
   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32>
@@ -2372,17 +2370,17 @@
 ;
 ; SSSE3-LABEL: combine_undef_input_test17:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_undef_input_test17:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: combine_undef_input_test17:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX-NEXT:    retq
   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32>
   %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32>
@@ -2397,17 +2395,17 @@
 ;
 ; SSSE3-LABEL: combine_undef_input_test18:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_undef_input_test18:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: combine_undef_input_test18:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX-NEXT:    retq
   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32>
   %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32>