Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -2782,6 +2782,14 @@
     return false;
   }
 
+  /// Return true if it is profitable to transform an insert of a subvector into
+  /// a shuffle. The type is the subvector type that is being bitcasted into
+  /// a scalar to allow an insertelement into a larger vector.
+  virtual bool shouldConvertInsSubVecToShuffle(EVT VT) const {
+    assert(VT.isVector() && "Unexpected type for shuffle transform");
+    return false;
+  }
+
   /// Return true if the target has native support for the specified value type
   /// and it is 'desirable' to use the type for the given node type. e.g. On x86
   /// i16 is legal, but undesirable since i16 instruction encodings are longer
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -13734,6 +13734,59 @@
   return St1;
 }
 
+/// Convert a disguised subvector insertion into a shuffle:
+/// insert_vector_elt V, (bitcast X from vector type), IdxC -->
+/// shuffle V, (extended and bitcast X), Mask
+/// Note: We do not use an insert_subvector node because that requires a legal
+/// subvector type.
+static SDValue combineInsertEltToShuffle(SDNode *N, SelectionDAG &DAG) {
+  SDValue InsertVal = N->getOperand(1);
+  if (InsertVal.getOpcode() != ISD::BITCAST || !InsertVal.hasOneUse() ||
+      !InsertVal.getOperand(0).getValueType().isVector())
+    return SDValue();
+
+  // Check if the target wants to use a shuffle for this data type.
+  SDValue SubVec = InsertVal.getOperand(0);
+  EVT SubVecVT = SubVec.getValueType();
+  if (!DAG.getTargetLoweringInfo().shouldConvertInsSubVecToShuffle(SubVecVT))
+    return SDValue();
+
+  // Step 1: Create a wide vector from the inserted source vector by appending
+  // undefined elements. This will be the same size as our destination vector.
+  SDValue DestVec = N->getOperand(0);
+  EVT VT = DestVec.getValueType();
+  unsigned ExtendRatio = VT.getSizeInBits() / SubVecVT.getSizeInBits();
+  SmallVector<SDValue, 8> ConcatOps(ExtendRatio, DAG.getUNDEF(SubVecVT));
+  ConcatOps[0] = SubVec;
+
+  SDLoc DL(N);
+  EVT SubVecEltVT = SubVecVT.getVectorElementType();
+  unsigned NumSrcElts = SubVecVT.getVectorNumElements();
+  unsigned NumMaskVals = ExtendRatio * NumSrcElts;
+  EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), SubVecEltVT, NumMaskVals);
+  SDValue PaddedSubVec = DAG.getBitcast(VT, DAG.getNode(ISD::CONCAT_VECTORS, DL,
+                                                        ConcatVT, ConcatOps));
+
+  // Step 2: Create a shuffle mask that implements this insert operation. The
+  // vector that we are inserting into will be operand 0 of the shuffle, so
+  // those elements are just 'i'. The inserted subvector is the 0-element of
+  // operand 1 of the shuffle. Example:
+  // insert v4i32 V, (v2i16 X), 2 --> shuffle v4i32 V, X', {0, 1, 4, 3}
+  unsigned NumDestElts = VT.getVectorNumElements();
+  SmallVector<int, 16> ShufMaskVals(NumDestElts);
+  assert(isa<ConstantSDNode>(N->getOperand(2)) && "Need constant insert index");
+  unsigned InsIndex = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+  for (unsigned i = 0; i != NumDestElts; ++i) {
+    if (i == InsIndex)
+      ShufMaskVals[i] = NumDestElts;
+    else
+      ShufMaskVals[i] = i;
+  }
+
+  // Step 3: Shuffle in the padded subvector.
+  return DAG.getVectorShuffle(VT, DL, DestVec, PaddedSubVec, ShufMaskVals);
+}
+
 SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
   SDValue InVec = N->getOperand(0);
   SDValue InVal = N->getOperand(1);
@@ -13755,7 +13808,9 @@
   // Check that we know which element is being inserted
   if (!isa<ConstantSDNode>(EltNo))
     return SDValue();
-  unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
+
+  if (SDValue Shuf = combineInsertEltToShuffle(N, DAG))
+    return Shuf;
 
   // Canonicalize insert_vector_elt dag nodes.
   // Example:
@@ -13764,6 +13819,7 @@
   //
   // Do this only if the child insert_vector node has one use; also
   // do this only if indices are both constants and Idx1 < Idx0.
+  unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
   if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT && InVec.hasOneUse() &&
       isa<ConstantSDNode>(InVec.getOperand(2))) {
     unsigned OtherElt = InVec.getConstantOperandVal(2);
Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -751,6 +751,10 @@
     bool isDesirableToCombineBuildVectorToShuffleTruncate(
         ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const override;
 
+    bool shouldConvertInsSubVecToShuffle(EVT VT) const override {
+      return VT.getVectorElementType() != MVT::i1;
+    }
+
     /// Return true if the target has native support for
     /// the specified value type and it is 'desirable' to use the type for the
     /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
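
To make the Step 2 mask construction easy to check by hand, here is a minimal standalone C++ sketch (not part of the patch; the helper name buildInsertShuffleMask and the printf driver are hypothetical, introduced only for illustration) that reproduces the mask from the comment's v4i32/v2i16 example:

  #include <cassert>
  #include <cstdio>
  #include <vector>

  // Element InsIndex of the result comes from lane 0 of the padded subvector
  // (operand 1 of the shuffle, i.e. mask value NumDestElts); every other
  // element is taken unchanged from operand 0 (mask value i).
  static std::vector<int> buildInsertShuffleMask(unsigned NumDestElts,
                                                 unsigned InsIndex) {
    assert(InsIndex < NumDestElts && "insert index out of range");
    std::vector<int> Mask(NumDestElts);
    for (unsigned i = 0; i != NumDestElts; ++i)
      Mask[i] = (i == InsIndex) ? (int)NumDestElts : (int)i;
    return Mask;
  }

  int main() {
    // insert v4i32 V, (v2i16 X), 2 --> shuffle v4i32 V, X', {0, 1, 4, 3}
    for (int M : buildInsertShuffleMask(4, 2))
      std::printf("%d ", M); // prints: 0 1 4 3
    return 0;
  }
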
Index: test/CodeGen/X86/insertelement-shuffle.ll
===================================================================
--- test/CodeGen/X86/insertelement-shuffle.ll
+++ test/CodeGen/X86/insertelement-shuffle.ll
@@ -7,42 +7,34 @@
 define <8 x float> @insert_subvector_256(i16 %x0, i16 %x1, <8 x float> %v) nounwind {
 ; X32_AVX256-LABEL: insert_subvector_256:
 ; X32_AVX256: # BB#0:
-; X32_AVX256-NEXT: pushl %eax
 ; X32_AVX256-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X32_AVX256-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; X32_AVX256-NEXT: vmovd %xmm1, (%esp)
-; X32_AVX256-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],mem[0],xmm0[2,3]
-; X32_AVX256-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; X32_AVX256-NEXT: popl %eax
+; X32_AVX256-NEXT: vpbroadcastd %xmm1, %xmm1
+; X32_AVX256-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
 ; X32_AVX256-NEXT: retl
 ;
 ; X64_AVX256-LABEL: insert_subvector_256:
 ; X64_AVX256: # BB#0:
 ; X64_AVX256-NEXT: vmovd %edi, %xmm1
 ; X64_AVX256-NEXT: vpinsrw $1, %esi, %xmm1, %xmm1
-; X64_AVX256-NEXT: vmovd %xmm1, -{{[0-9]+}}(%rsp)
-; X64_AVX256-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],mem[0],xmm0[2,3]
-; X64_AVX256-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; X64_AVX256-NEXT: vpbroadcastd %xmm1, %xmm1
+; X64_AVX256-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
 ; X64_AVX256-NEXT: retq
 ;
 ; X32_AVX512-LABEL: insert_subvector_256:
 ; X32_AVX512: # BB#0:
-; X32_AVX512-NEXT: pushl %eax
 ; X32_AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X32_AVX512-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; X32_AVX512-NEXT: vmovd %xmm1, (%esp)
-; X32_AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],mem[0],xmm0[2,3]
-; X32_AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; X32_AVX512-NEXT: popl %eax
+; X32_AVX512-NEXT: vpbroadcastd %xmm1, %xmm1
+; X32_AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
 ; X32_AVX512-NEXT: retl
 ;
 ; X64_AVX512-LABEL: insert_subvector_256:
 ; X64_AVX512: # BB#0:
 ; X64_AVX512-NEXT: vmovd %edi, %xmm1
 ; X64_AVX512-NEXT: vpinsrw $1, %esi, %xmm1, %xmm1
-; X64_AVX512-NEXT: vmovd %xmm1, -{{[0-9]+}}(%rsp)
-; X64_AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],mem[0],xmm0[2,3]
-; X64_AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; X64_AVX512-NEXT: vpbroadcastd %xmm1, %xmm1
+; X64_AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
 ; X64_AVX512-NEXT: retq
   %ins1 = insertelement <2 x i16> undef, i16 %x0, i32 0
   %ins2 = insertelement <2 x i16> %ins1, i16 %x1, i32 1
@@ -54,54 +46,32 @@
 define <8 x i64> @insert_subvector_512(i32 %x0, i32 %x1, <8 x i64> %v) nounwind {
 ; X32_AVX256-LABEL: insert_subvector_512:
 ; X32_AVX256: # BB#0:
-; X32_AVX256-NEXT: pushl %ebp
-; X32_AVX256-NEXT: movl %esp, %ebp
-; X32_AVX256-NEXT: andl $-8, %esp
-; X32_AVX256-NEXT: subl $8, %esp
 ; X32_AVX256-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; X32_AVX256-NEXT: vmovlps %xmm2, (%esp)
-; X32_AVX256-NEXT: vextracti128 $1, %ymm0, %xmm2
-; X32_AVX256-NEXT: vpinsrd $0, (%esp), %xmm2, %xmm2
-; X32_AVX256-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; X32_AVX256-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; X32_AVX256-NEXT: movl %ebp, %esp
-; X32_AVX256-NEXT: popl %ebp
+; X32_AVX256-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
+; X32_AVX256-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
 ; X32_AVX256-NEXT: retl
 ;
 ; X64_AVX256-LABEL: insert_subvector_512:
 ; X64_AVX256: # BB#0:
 ; X64_AVX256-NEXT: vmovd %edi, %xmm2
 ; X64_AVX256-NEXT: vpinsrd $1, %esi, %xmm2, %xmm2
-; X64_AVX256-NEXT: vmovq %xmm2, %rax
-; X64_AVX256-NEXT: vextracti128 $1, %ymm0, %xmm2
-; X64_AVX256-NEXT: vpinsrq $0, %rax, %xmm2, %xmm2
-; X64_AVX256-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; X64_AVX256-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; X64_AVX256-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
 ; X64_AVX256-NEXT: retq
 ;
 ; X32_AVX512-LABEL: insert_subvector_512:
 ; X32_AVX512: # BB#0:
-; X32_AVX512-NEXT: pushl %ebp
-; X32_AVX512-NEXT: movl %esp, %ebp
-; X32_AVX512-NEXT: andl $-8, %esp
-; X32_AVX512-NEXT: subl $8, %esp
-; X32_AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X32_AVX512-NEXT: vmovlps %xmm1, (%esp)
-; X32_AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X32_AVX512-NEXT: vpinsrd $0, (%esp), %xmm1, %xmm1
-; X32_AVX512-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; X32_AVX512-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
-; X32_AVX512-NEXT: movl %ebp, %esp
-; X32_AVX512-NEXT: popl %ebp
+; X32_AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; X32_AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,1,0,8,0,3,0,4,0,5,0,6,0,7,0]
+; X32_AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
 ; X32_AVX512-NEXT: retl
 ;
 ; X64_AVX512-LABEL: insert_subvector_512:
 ; X64_AVX512: # BB#0:
 ; X64_AVX512-NEXT: vmovd %edi, %xmm1
 ; X64_AVX512-NEXT: vpinsrd $1, %esi, %xmm1, %xmm1
-; X64_AVX512-NEXT: vmovq %xmm1, %rax
-; X64_AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64_AVX512-NEXT: vpinsrq $0, %rax, %xmm1, %xmm1
-; X64_AVX512-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; X64_AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,8,3,4,5,6,7]
+; X64_AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
 ; X64_AVX512-NEXT: retq
   %ins1 = insertelement <2 x i32> undef, i32 %x0, i32 0
   %ins2 = insertelement <2 x i32> %ins1, i32 %x1, i32 1
@@ -116,22 +86,9 @@
 define <8 x i64> @insert_subvector_into_undef(i32 %x0, i32 %x1) nounwind {
 ; X32_AVX256-LABEL: insert_subvector_into_undef:
 ; X32_AVX256: # BB#0:
-; X32_AVX256-NEXT: pushl %ebp
-; X32_AVX256-NEXT: movl %esp, %ebp
-; X32_AVX256-NEXT: andl $-8, %esp
-; X32_AVX256-NEXT: subl $8, %esp
 ; X32_AVX256-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X32_AVX256-NEXT: vmovlps %xmm0, (%esp)
-; X32_AVX256-NEXT: movl (%esp), %eax
-; X32_AVX256-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32_AVX256-NEXT: vmovd %eax, %xmm0
-; X32_AVX256-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
-; X32_AVX256-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
-; X32_AVX256-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0
-; X32_AVX256-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32_AVX256-NEXT: vmovdqa %ymm0, %ymm1
-; X32_AVX256-NEXT: movl %ebp, %esp
-; X32_AVX256-NEXT: popl %ebp
+; X32_AVX256-NEXT: vbroadcastsd %xmm0, %ymm0
+; X32_AVX256-NEXT: vmovaps %ymm0, %ymm1
 ; X32_AVX256-NEXT: retl
 ;
 ; X64_AVX256-LABEL: insert_subvector_into_undef:
@@ -144,22 +101,8 @@
 ;
 ; X32_AVX512-LABEL: insert_subvector_into_undef:
 ; X32_AVX512: # BB#0:
-; X32_AVX512-NEXT: pushl %ebp
-; X32_AVX512-NEXT: movl %esp, %ebp
-; X32_AVX512-NEXT: andl $-8, %esp
-; X32_AVX512-NEXT: subl $8, %esp
 ; X32_AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X32_AVX512-NEXT: vmovlps %xmm0, (%esp)
-; X32_AVX512-NEXT: movl (%esp), %eax
-; X32_AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32_AVX512-NEXT: vmovd %eax, %xmm0
-; X32_AVX512-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
-; X32_AVX512-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
-; X32_AVX512-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0
-; X32_AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32_AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; X32_AVX512-NEXT: movl %ebp, %esp
-; X32_AVX512-NEXT: popl %ebp
+; X32_AVX512-NEXT: vbroadcastsd %xmm0, %zmm0
 ; X32_AVX512-NEXT: retl
 ;
 ; X64_AVX512-LABEL: insert_subvector_into_undef: