Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -13734,6 +13734,59 @@
   return St1;
 }
 
+/// Convert a disguised subvector insertion into a shuffle:
+///   insert_vector_elt V, (bitcast X from vector type), IdxC -->
+///     bitcast(shuffle (bitcast V), (extended X), Mask)
+/// Note: We do not use an insert_subvector node because that requires a legal
+/// subvector type.
+static SDValue combineInsertEltToShuffle(SDNode *N, SelectionDAG &DAG) {
+  SDValue InsertVal = N->getOperand(1);
+  if (InsertVal.getOpcode() != ISD::BITCAST || !InsertVal.hasOneUse() ||
+      !InsertVal.getOperand(0).getValueType().isVector())
+    return SDValue();
+
+  SDValue SubVec = InsertVal.getOperand(0);
+  SDValue DestVec = N->getOperand(0);
+  EVT SubVecVT = SubVec.getValueType();
+  EVT VT = DestVec.getValueType();
+  unsigned NumSrcElts = SubVecVT.getVectorNumElements();
+  unsigned ExtendRatio = VT.getSizeInBits() / SubVecVT.getSizeInBits();
+  unsigned NumMaskVals = ExtendRatio * NumSrcElts;
+
+  // Step 1: Create a shuffle mask that implements this insert operation. The
+  // vector that we are inserting into will be operand 0 of the shuffle, so
+  // those elements are just 'i'. The inserted subvector is in the first
+  // positions of operand 1 of the shuffle. Example:
+  // insert v4i32 V, (v2i16 X), 2 --> shuffle v8i16 V', X', {0,1,2,3,8,9,6,7}
+  SmallVector<int, 16> Mask(NumMaskVals);
+  assert(isa<ConstantSDNode>(N->getOperand(2)) && "Need constant insert index");
+  unsigned InsIndex = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+  for (unsigned i = 0; i != NumMaskVals; ++i) {
+    if (i / NumSrcElts == InsIndex)
+      Mask[i] = (i % NumSrcElts) + NumMaskVals;
+    else
+      Mask[i] = i;
+  }
+
+  // Bail out if the target can not handle the shuffle we want to create.
+  EVT SubVecEltVT = SubVecVT.getVectorElementType();
+  EVT ShufVT = EVT::getVectorVT(*DAG.getContext(), SubVecEltVT, NumMaskVals);
+  if (!DAG.getTargetLoweringInfo().isShuffleMaskLegal(Mask, ShufVT))
+    return SDValue();
+
+  // Step 2: Create a wide vector from the inserted source vector by appending
+  // undefined elements. This is the same size as our destination vector.
+  SDLoc DL(N);
+  SmallVector<SDValue, 8> ConcatOps(ExtendRatio, DAG.getUNDEF(SubVecVT));
+  ConcatOps[0] = SubVec;
+  SDValue PaddedSubV = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShufVT, ConcatOps);
+
+  // Step 3: Shuffle in the padded subvector.
+  SDValue DestVecBC = DAG.getBitcast(ShufVT, DestVec);
+  SDValue Shuf = DAG.getVectorShuffle(ShufVT, DL, DestVecBC, PaddedSubV, Mask);
+  return DAG.getBitcast(VT, Shuf);
+}
+
 SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
   SDValue InVec = N->getOperand(0);
   SDValue InVal = N->getOperand(1);
@@ -13755,7 +13808,9 @@
   // Check that we know which element is being inserted
   if (!isa<ConstantSDNode>(EltNo))
     return SDValue();
-  unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
+
+  if (SDValue Shuf = combineInsertEltToShuffle(N, DAG))
+    return Shuf;
 
   // Canonicalize insert_vector_elt dag nodes.
   // Example:
@@ -13764,6 +13819,7 @@
   //
   // Do this only if the child insert_vector node has one use; also
   // do this only if indices are both constants and Idx1 < Idx0.
+  unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
   if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT && InVec.hasOneUse()
       && isa<ConstantSDNode>(InVec.getOperand(2))) {
     unsigned OtherElt = InVec.getConstantOperandVal(2);
Index: test/CodeGen/AArch64/arm64-neon-copy.ll
===================================================================
--- test/CodeGen/AArch64/arm64-neon-copy.ll
+++ test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -140,7 +140,7 @@
 
 define <2 x double> @ins1f2(<1 x double> %tmp1, <2 x double> %tmp2) {
 ; CHECK-LABEL: ins1f2:
-; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
   %tmp3 = extractelement <1 x double> %tmp1, i32 0
   %tmp4 = insertelement <2 x double> %tmp2, double %tmp3, i32 1
   ret <2 x double> %tmp4
Index: test/CodeGen/X86/insertelement-shuffle.ll
===================================================================
--- test/CodeGen/X86/insertelement-shuffle.ll
+++ test/CodeGen/X86/insertelement-shuffle.ll
@@ -7,42 +7,34 @@
 define <8 x float> @insert_subvector_256(i16 %x0, i16 %x1, <8 x float> %v) nounwind {
 ; X32_AVX256-LABEL: insert_subvector_256:
 ; X32_AVX256: # BB#0:
-; X32_AVX256-NEXT: pushl %eax
 ; X32_AVX256-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X32_AVX256-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; X32_AVX256-NEXT: vmovd %xmm1, (%esp)
-; X32_AVX256-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],mem[0],xmm0[2,3]
-; X32_AVX256-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; X32_AVX256-NEXT: popl %eax
+; X32_AVX256-NEXT: vpbroadcastd %xmm1, %xmm1
+; X32_AVX256-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
 ; X32_AVX256-NEXT: retl
 ;
 ; X64_AVX256-LABEL: insert_subvector_256:
 ; X64_AVX256: # BB#0:
 ; X64_AVX256-NEXT: vmovd %edi, %xmm1
 ; X64_AVX256-NEXT: vpinsrw $1, %esi, %xmm1, %xmm1
-; X64_AVX256-NEXT: vmovd %xmm1, -{{[0-9]+}}(%rsp)
-; X64_AVX256-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],mem[0],xmm0[2,3]
-; X64_AVX256-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; X64_AVX256-NEXT: vpbroadcastd %xmm1, %xmm1
+; X64_AVX256-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
 ; X64_AVX256-NEXT: retq
 ;
 ; X32_AVX512-LABEL: insert_subvector_256:
 ; X32_AVX512: # BB#0:
-; X32_AVX512-NEXT: pushl %eax
 ; X32_AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X32_AVX512-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; X32_AVX512-NEXT: vmovd %xmm1, (%esp)
-; X32_AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],mem[0],xmm0[2,3]
-; X32_AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; X32_AVX512-NEXT: popl %eax
+; X32_AVX512-NEXT: vpbroadcastd %xmm1, %xmm1
+; X32_AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
 ; X32_AVX512-NEXT: retl
 ;
 ; X64_AVX512-LABEL: insert_subvector_256:
 ; X64_AVX512: # BB#0:
 ; X64_AVX512-NEXT: vmovd %edi, %xmm1
 ; X64_AVX512-NEXT: vpinsrw $1, %esi, %xmm1, %xmm1
-; X64_AVX512-NEXT: vmovd %xmm1, -{{[0-9]+}}(%rsp)
-; X64_AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],mem[0],xmm0[2,3]
-; X64_AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; X64_AVX512-NEXT: vpbroadcastd %xmm1, %xmm1
+; X64_AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
 ; X64_AVX512-NEXT: retq
   %ins1 = insertelement <2 x i16> undef, i16 %x0, i32 0
   %ins2 = insertelement <2 x i16> %ins1, i16 %x1, i32 1
@@ -80,28 +72,17 @@
 ;
 ; X32_AVX512-LABEL: insert_subvector_512:
 ; X32_AVX512: # BB#0:
-; X32_AVX512-NEXT: pushl %ebp
-; X32_AVX512-NEXT: movl %esp, %ebp
-; X32_AVX512-NEXT: andl $-8, %esp
-; X32_AVX512-NEXT: subl $8, %esp
-; X32_AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X32_AVX512-NEXT: vmovlps %xmm1, (%esp)
-; X32_AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X32_AVX512-NEXT: vpinsrd $0, (%esp), %xmm1, %xmm1
-; X32_AVX512-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; X32_AVX512-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
-; X32_AVX512-NEXT: movl %ebp, %esp
-; X32_AVX512-NEXT: popl %ebp
+; X32_AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; X32_AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,1,0,8,0,3,0,4,0,5,0,6,0,7,0]
+; X32_AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
 ; X32_AVX512-NEXT: retl
 ;
 ; X64_AVX512-LABEL: insert_subvector_512:
 ; X64_AVX512: # BB#0:
 ; X64_AVX512-NEXT: vmovd %edi, %xmm1
 ; X64_AVX512-NEXT: vpinsrd $1, %esi, %xmm1, %xmm1
-; X64_AVX512-NEXT: vmovq %xmm1, %rax
-; X64_AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64_AVX512-NEXT: vpinsrq $0, %rax, %xmm1, %xmm1
-; X64_AVX512-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; X64_AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,8,3,4,5,6,7]
+; X64_AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
 ; X64_AVX512-NEXT: retq
   %ins1 = insertelement <2 x i32> undef, i32 %x0, i32 0
   %ins2 = insertelement <2 x i32> %ins1, i32 %x1, i32 1
@@ -144,22 +125,8 @@
 ;
 ; X32_AVX512-LABEL: insert_subvector_into_undef:
 ; X32_AVX512: # BB#0:
-; X32_AVX512-NEXT: pushl %ebp
-; X32_AVX512-NEXT: movl %esp, %ebp
-; X32_AVX512-NEXT: andl $-8, %esp
-; X32_AVX512-NEXT: subl $8, %esp
 ; X32_AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X32_AVX512-NEXT: vmovlps %xmm0, (%esp)
-; X32_AVX512-NEXT: movl (%esp), %eax
-; X32_AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32_AVX512-NEXT: vmovd %eax, %xmm0
-; X32_AVX512-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
-; X32_AVX512-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
-; X32_AVX512-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0
-; X32_AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32_AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; X32_AVX512-NEXT: movl %ebp, %esp
-; X32_AVX512-NEXT: popl %ebp
+; X32_AVX512-NEXT: vbroadcastsd %xmm0, %zmm0
 ; X32_AVX512-NEXT: retl
 ;
 ; X64_AVX512-LABEL: insert_subvector_into_undef: