Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15726,6 +15726,87 @@
                               NewMask);
 }
 
+/// If the shuffle mask is taking exactly one element from the first vector
+/// operand and passing through all other elements from the second vector
+/// operand, return the index of the mask element that is choosing an element
+/// from the first operand. Otherwise, return -1.
+///
+/// Mask values in [0, MaskSize) select from operand 0; a lane i passes operand
+/// 1 through unchanged only when its mask value is exactly i + MaskSize. An
+/// undef (negative) mask value satisfies neither test, so it rejects the mask.
+static int getShuffleMaskIndexOfOneElementFromOp0IntoOp1(ArrayRef<int> Mask) {
+  int MaskSize = Mask.size();
+  int EltFromOp0 = -1;
+  for (int i = 0; i != MaskSize; ++i) {
+    if (Mask[i] >= 0 && Mask[i] < MaskSize) {
+      // We're looking for a shuffle of exactly one element from operand 0.
+      if (EltFromOp0 != -1)
+        return -1;
+      EltFromOp0 = i;
+    } else if (Mask[i] != i + MaskSize) {
+      // Nothing from operand 1 can change lanes.
+      return -1;
+    }
+  }
+  return EltFromOp0;
+}
+
+/// If a shuffle inserts exactly one element from a source vector operand into
+/// another vector operand and we can access the specified element as a scalar,
+/// then we can eliminate the shuffle.
+///
+/// Returns the replacement INSERT_VECTOR_ELT node, or an empty SDValue if the
+/// pattern does not match.
+static SDValue replaceShuffleOfInsert(ShuffleVectorSDNode *Shuf,
+                                      SelectionDAG &DAG) {
+  // First, check if we are taking one element of a vector and shuffling that
+  // element into another vector.
+  ArrayRef<int> Mask = Shuf->getMask();
+  // Keep the commuted copy at function scope: Mask is a non-owning view that
+  // may be rebound to this storage below and is read after the 'if' block.
+  SmallVector<int, 16> CommutedMask(Mask.begin(), Mask.end());
+  SDValue Op0 = Shuf->getOperand(0);
+  SDValue Op1 = Shuf->getOperand(1);
+  int ShufOp0Index = getShuffleMaskIndexOfOneElementFromOp0IntoOp1(Mask);
+  if (ShufOp0Index == -1) {
+    // Commute mask and check again.
+    ShuffleVectorSDNode::commuteMask(CommutedMask);
+    ShufOp0Index = getShuffleMaskIndexOfOneElementFromOp0IntoOp1(CommutedMask);
+    if (ShufOp0Index == -1)
+      return SDValue();
+    // Commute operands to match the commuted shuffle mask.
+    std::swap(Op0, Op1);
+    Mask = CommutedMask;
+  }
+
+  // The shuffle inserts exactly one element from operand 0 into operand 1.
+  // Now see if we can access that element as a scalar via a real insert element
+  // instruction.
+  assert(Mask[ShufOp0Index] >= 0 && Mask[ShufOp0Index] < (int)Mask.size() &&
+         "Shuffle mask value must be from operand 0");
+  if (Op0.getOpcode() != ISD::INSERT_VECTOR_ELT)
+    return SDValue();
+
+  auto *InsIndexC = dyn_cast<ConstantSDNode>(Op0.getOperand(2));
+  if (!InsIndexC || InsIndexC->getSExtValue() != Mask[ShufOp0Index])
+    return SDValue();
+
+  // There's an existing insertelement with constant insertion index, so we
+  // don't need to check the legality/profitability of a replacement operation
+  // that differs at most in the constant value. The target should be able to
+  // lower any of those in a similar way. If not, legalization will expand this
+  // to a scalar-to-vector plus shuffle.
+  //
+  // Note that the shuffle may move the scalar from the position that the insert
+  // element used. Therefore, our new insert element occurs at the shuffle's
+  // mask index value, not the insert's index value.
+  // shuffle (insertelt v1, x, C), v2, mask --> insertelt v2, x, C'
+  SDValue NewInsIndex = DAG.getConstant(ShufOp0Index, SDLoc(Shuf),
+                                        Op0.getOperand(2).getValueType());
+  return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Shuf), Op0.getValueType(),
+                     Op1, Op0.getOperand(1), NewInsIndex);
+}
+
 SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
   EVT VT = N->getValueType(0);
   unsigned NumElts = VT.getVectorNumElements();
@@ -15776,6 +15857,9 @@
   if (SDValue V = simplifyShuffleMask(SVN, N0, N1, DAG))
     return V;
 
+  if (SDValue InsElt = replaceShuffleOfInsert(SVN, DAG))
+    return InsElt;
+
   // A shuffle of a single vector that is a splat can always be folded.
if (auto *N0Shuf = dyn_cast(N0)) if (N1->isUndef() && N0Shuf->isSplat()) Index: test/CodeGen/X86/shuffle-of-insert.ll =================================================================== --- test/CodeGen/X86/shuffle-of-insert.ll +++ test/CodeGen/X86/shuffle-of-insert.ll @@ -13,14 +13,13 @@ ; ; SSE4-LABEL: ins_elt_0: ; SSE4: # BB#0: -; SSE4-NEXT: pinsrd $0, %edi, %xmm0 -; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; SSE4-NEXT: pinsrd $0, %edi, %xmm1 +; SSE4-NEXT: movdqa %xmm1, %xmm0 ; SSE4-NEXT: retq ; ; AVX-LABEL: ins_elt_0: ; AVX: # BB#0: -; AVX-NEXT: vpinsrd $0, %edi, %xmm0, %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX-NEXT: vpinsrd $0, %edi, %xmm1, %xmm0 ; AVX-NEXT: retq %ins = insertelement <4 x i32> %v1, i32 %x, i32 0 %shuf = shufflevector <4 x i32> %ins, <4 x i32> %v2, <4 x i32> @@ -30,23 +29,20 @@ define <4 x i32> @ins_elt_1(i32 %x, <4 x i32> %v1, <4 x i32> %v2) { ; SSE2-LABEL: ins_elt_1: ; SSE2: # BB#0: -; SSE2-NEXT: movd %edi, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[0,0] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,0] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3] -; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: movd %edi, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSE2-NEXT: retq ; ; SSE4-LABEL: ins_elt_1: ; SSE4: # BB#0: -; SSE4-NEXT: pinsrd $1, %edi, %xmm0 -; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; SSE4-NEXT: pinsrd $1, %edi, %xmm1 +; SSE4-NEXT: movdqa %xmm1, %xmm0 ; SSE4-NEXT: retq ; ; AVX-LABEL: ins_elt_1: ; AVX: # BB#0: -; AVX-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; AVX-NEXT: vpinsrd $1, %edi, %xmm1, %xmm0 ; AVX-NEXT: retq %ins = insertelement <4 x i32> %v1, i32 %x, i32 1 %shuf = shufflevector <4 x i32> %ins, <4 x i32> %v2, <4 x i32> @@ -58,24 +54,21 @@ define <4 x i32> 
@ins_elt_2_commute(i32 %x, <4 x i32> %v1, <4 x i32> %v2) { ; SSE2-LABEL: ins_elt_2_commute: ; SSE2: # BB#0: -; SSE2-NEXT: movd %edi, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[3,0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0,2] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] +; SSE2-NEXT: movd %edi, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE4-LABEL: ins_elt_2_commute: ; SSE4: # BB#0: -; SSE4-NEXT: pinsrd $2, %edi, %xmm0 -; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; SSE4-NEXT: pinsrd $2, %edi, %xmm1 +; SSE4-NEXT: movdqa %xmm1, %xmm0 ; SSE4-NEXT: retq ; ; AVX-LABEL: ins_elt_2_commute: ; AVX: # BB#0: -; AVX-NEXT: vpinsrd $2, %edi, %xmm0, %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; AVX-NEXT: vpinsrd $2, %edi, %xmm1, %xmm0 ; AVX-NEXT: retq %ins = insertelement <4 x i32> %v1, i32 %x, i32 2 %shuf = shufflevector <4 x i32> %v2, <4 x i32> %ins, <4 x i32> @@ -85,24 +78,21 @@ define <4 x i32> @ins_elt_3_commute(i32 %x, <4 x i32> %v1, <4 x i32> %v2) { ; SSE2-LABEL: ins_elt_3_commute: ; SSE2: # BB#0: -; SSE2-NEXT: movd %edi, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] +; SSE2-NEXT: movd %edi, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE4-LABEL: ins_elt_3_commute: ; SSE4: # BB#0: -; SSE4-NEXT: pinsrd $3, %edi, %xmm0 -; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] +; SSE4-NEXT: pinsrd $3, %edi, %xmm1 +; SSE4-NEXT: movdqa %xmm1, %xmm0 ; SSE4-NEXT: retq ; ; AVX-LABEL: ins_elt_3_commute: ; AVX: # BB#0: -; AVX-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 -; AVX-NEXT: vpblendw 
{{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] +; AVX-NEXT: vpinsrd $3, %edi, %xmm1, %xmm0 ; AVX-NEXT: retq %ins = insertelement <4 x i32> %v1, i32 %x, i32 3 %shuf = shufflevector <4 x i32> %v2, <4 x i32> %ins, <4 x i32> @@ -122,16 +112,13 @@ ; ; SSE4-LABEL: ins_elt_0_to_2: ; SSE4: # BB#0: -; SSE4-NEXT: pinsrd $0, %edi, %xmm0 -; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; SSE4-NEXT: pinsrd $2, %edi, %xmm1 +; SSE4-NEXT: movdqa %xmm1, %xmm0 ; SSE4-NEXT: retq ; ; AVX-LABEL: ins_elt_0_to_2: ; AVX: # BB#0: -; AVX-NEXT: vpinsrd $0, %edi, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; AVX-NEXT: vpinsrd $2, %edi, %xmm1, %xmm0 ; AVX-NEXT: retq %ins = insertelement <4 x i32> %v1, i32 %x, i32 0 %shuf = shufflevector <4 x i32> %ins, <4 x i32> %v2, <4 x i32> @@ -148,16 +135,13 @@ ; ; SSE4-LABEL: ins_elt_1_to_0: ; SSE4: # BB#0: -; SSE4-NEXT: pinsrd $1, %edi, %xmm0 -; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; SSE4-NEXT: pinsrd $0, %edi, %xmm1 +; SSE4-NEXT: movdqa %xmm1, %xmm0 ; SSE4-NEXT: retq ; ; AVX-LABEL: ins_elt_1_to_0: ; AVX: # BB#0: -; AVX-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX-NEXT: vpinsrd $0, %edi, %xmm1, %xmm0 ; AVX-NEXT: retq %ins = insertelement <4 x i32> %v1, i32 %x, i32 1 %shuf = shufflevector <4 x i32> %ins, <4 x i32> %v2, <4 x i32> @@ -167,26 +151,21 @@ define <4 x i32> @ins_elt_2_to_3(i32 %x, <4 x i32> %v1, <4 x i32> %v2) { ; SSE2-LABEL: ins_elt_2_to_3: ; SSE2: # BB#0: -; SSE2-NEXT: movd %edi, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[3,0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0,2] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,0] +; SSE2-NEXT: movd %edi, %xmm0 +; 
SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE4-LABEL: ins_elt_2_to_3: ; SSE4: # BB#0: -; SSE4-NEXT: pinsrd $2, %edi, %xmm0 -; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] -; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] +; SSE4-NEXT: pinsrd $3, %edi, %xmm1 +; SSE4-NEXT: movdqa %xmm1, %xmm0 ; SSE4-NEXT: retq ; ; AVX-LABEL: ins_elt_2_to_3: ; AVX: # BB#0: -; AVX-NEXT: vpinsrd $2, %edi, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] +; AVX-NEXT: vpinsrd $3, %edi, %xmm1, %xmm0 ; AVX-NEXT: retq %ins = insertelement <4 x i32> %v1, i32 %x, i32 2 %shuf = shufflevector <4 x i32> %v2, <4 x i32> %ins, <4 x i32> @@ -196,25 +175,20 @@ define <4 x i32> @ins_elt_3_to_1(i32 %x, <4 x i32> %v1, <4 x i32> %v2) { ; SSE2-LABEL: ins_elt_3_to_1: ; SSE2: # BB#0: -; SSE2-NEXT: movd %edi, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[0,0] +; SSE2-NEXT: movd %edi, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSE2-NEXT: retq ; ; SSE4-LABEL: ins_elt_3_to_1: ; SSE4: # BB#0: -; SSE4-NEXT: pinsrd $3, %edi, %xmm0 -; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; SSE4-NEXT: pinsrd $1, %edi, %xmm1 +; SSE4-NEXT: movdqa %xmm1, %xmm0 ; SSE4-NEXT: retq ; ; AVX-LABEL: ins_elt_3_to_1: ; AVX: # BB#0: -; AVX-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; AVX-NEXT: vpinsrd $1, %edi, %xmm1, %xmm0 ; AVX-NEXT: retq %ins = insertelement <4 x i32> %v1, i32 %x, i32 3 %shuf = shufflevector <4 x i32> %v2, <4 
x i32> %ins, <4 x i32> Index: test/CodeGen/X86/sse41.ll =================================================================== --- test/CodeGen/X86/sse41.ll +++ test/CodeGen/X86/sse41.ll @@ -391,16 +391,12 @@ ; X32-LABEL: insertps_from_load_ins_elt_undef_i32: ; X32: ## BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7] +; X32-NEXT: pinsrd $2, (%eax), %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: insertps_from_load_ins_elt_undef_i32: ; X64: ## BB#0: -; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7] +; X64-NEXT: pinsrd $2, (%rdi), %xmm0 ; X64-NEXT: retq %1 = load i32, i32* %b, align 4 %2 = insertelement <4 x i32> undef, i32 %1, i32 0 Index: test/CodeGen/X86/vector-shuffle-128-v2.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-128-v2.ll +++ test/CodeGen/X86/vector-shuffle-128-v2.ll @@ -1063,27 +1063,13 @@ ; ; SSE41-LABEL: insert_reg_lo_v2i64: ; SSE41: # BB#0: -; SSE41-NEXT: movq %rdi, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: pinsrq $0, %rdi, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: insert_reg_lo_v2i64: -; AVX1: # BB#0: -; AVX1-NEXT: vmovq %rdi, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-NEXT: retq -; -; AVX2-LABEL: insert_reg_lo_v2i64: -; AVX2: # BB#0: -; AVX2-NEXT: vmovq %rdi, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: insert_reg_lo_v2i64: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovq %rdi, %xmm1 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512VL-NEXT: retq +; AVX-LABEL: insert_reg_lo_v2i64: +; AVX: # BB#0: +; AVX-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0 +; 
AVX-NEXT: retq %v = insertelement <2 x i64> undef, i64 %a, i32 0 %shuffle = shufflevector <2 x i64> %v, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle @@ -1107,27 +1093,13 @@ ; ; SSE41-LABEL: insert_mem_lo_v2i64: ; SSE41: # BB#0: -; SSE41-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: pinsrq $0, (%rdi), %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: insert_mem_lo_v2i64: -; AVX1: # BB#0: -; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-NEXT: retq -; -; AVX2-LABEL: insert_mem_lo_v2i64: -; AVX2: # BB#0: -; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: insert_mem_lo_v2i64: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512VL-NEXT: retq +; AVX-LABEL: insert_mem_lo_v2i64: +; AVX: # BB#0: +; AVX-NEXT: vpinsrq $0, (%rdi), %xmm0, %xmm0 +; AVX-NEXT: retq %a = load i64, i64* %ptr %v = insertelement <2 x i64> undef, i64 %a, i32 0 %shuffle = shufflevector <2 x i64> %v, <2 x i64> %b, <2 x i32> @@ -1135,16 +1107,32 @@ } define <2 x i64> @insert_reg_hi_v2i64(i64 %a, <2 x i64> %b) { -; SSE-LABEL: insert_reg_hi_v2i64: -; SSE: # BB#0: -; SSE-NEXT: movq %rdi, %xmm1 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: retq +; SSE2-LABEL: insert_reg_hi_v2i64: +; SSE2: # BB#0: +; SSE2-NEXT: movq %rdi, %xmm1 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: retq +; +; SSE3-LABEL: insert_reg_hi_v2i64: +; SSE3: # BB#0: +; SSE3-NEXT: movq %rdi, %xmm1 +; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE3-NEXT: retq +; +; SSSE3-LABEL: insert_reg_hi_v2i64: +; SSSE3: # BB#0: +; SSSE3-NEXT: movq %rdi, %xmm1 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: 
insert_reg_hi_v2i64: +; SSE41: # BB#0: +; SSE41-NEXT: pinsrq $1, %rdi, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: insert_reg_hi_v2i64: ; AVX: # BB#0: -; AVX-NEXT: vmovq %rdi, %xmm1 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 ; AVX-NEXT: retq %v = insertelement <2 x i64> undef, i64 %a, i32 0 %shuffle = shufflevector <2 x i64> %v, <2 x i64> %b, <2 x i32> @@ -1152,16 +1140,32 @@ } define <2 x i64> @insert_mem_hi_v2i64(i64* %ptr, <2 x i64> %b) { -; SSE-LABEL: insert_mem_hi_v2i64: -; SSE: # BB#0: -; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: retq +; SSE2-LABEL: insert_mem_hi_v2i64: +; SSE2: # BB#0: +; SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: retq +; +; SSE3-LABEL: insert_mem_hi_v2i64: +; SSE3: # BB#0: +; SSE3-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE3-NEXT: retq +; +; SSSE3-LABEL: insert_mem_hi_v2i64: +; SSSE3: # BB#0: +; SSSE3-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: insert_mem_hi_v2i64: +; SSE41: # BB#0: +; SSE41-NEXT: pinsrq $1, (%rdi), %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: insert_mem_hi_v2i64: ; AVX: # BB#0: -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vpinsrq $1, (%rdi), %xmm0, %xmm0 ; AVX-NEXT: retq %a = load i64, i64* %ptr %v = insertelement <2 x i64> undef, i64 %a, i32 0 Index: test/CodeGen/X86/vector-shuffle-combining.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-combining.ll +++ test/CodeGen/X86/vector-shuffle-combining.ll @@ -2759,21 +2759,15 @@ ; ; SSE41-LABEL: combine_constant_insertion_v4i32: ; SSE41: # BB#0: -; SSE41-NEXT: movd %edi, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = 
xmm0[0,1],mem[2,3,4,5,6,7] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = +; SSE41-NEXT: pinsrd $0, %edi, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: combine_constant_insertion_v4i32: -; AVX1: # BB#0: -; AVX1-NEXT: vmovd %edi, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7] -; AVX1-NEXT: retq -; -; AVX2-LABEL: combine_constant_insertion_v4i32: -; AVX2: # BB#0: -; AVX2-NEXT: vmovd %edi, %xmm0 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],mem[1,2,3] -; AVX2-NEXT: retq +; AVX-LABEL: combine_constant_insertion_v4i32: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX-NEXT: vpinsrd $0, %edi, %xmm0, %xmm0 +; AVX-NEXT: retq %a0 = insertelement <4 x i32> undef, i32 %f, i32 0 %ret = shufflevector <4 x i32> %a0, <4 x i32> , <4 x i32> ret <4 x i32> %ret