Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -8576,9 +8576,9 @@
   // If we are inserting one variable into a vector of non-zero constants, try
   // to avoid loading each constant element as a scalar. Load the constants as a
   // vector and then insert the variable scalar element. If insertion is not
-  // supported, we assume that we will fall back to a shuffle to get the scalar
-  // blended with the constants. Insertion into a zero vector is handled as a
-  // special-case somewhere below here.
+  // supported, fall back to a shuffle to get the scalar blended with the
+  // constants. Insertion into a zero vector is handled as a special-case
+  // somewhere below here.
   if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
       (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
        isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
@@ -8616,7 +8616,21 @@
     MachineFunction &MF = DAG.getMachineFunction();
     MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
     SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
-    return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
+    unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
+    unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
+    if (InsertC < NumEltsInLow128Bits)
+      return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
+
+    // There's no good way to insert into the high elements of a >128-bit
+    // vector, so use shuffles to avoid an extract/insert sequence.
+    assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
+    assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
+    SmallVector<int, 8> ShuffleMask;
+    unsigned NumElts = VT.getVectorNumElements();
+    for (unsigned i = 0; i != NumElts; ++i)
+      ShuffleMask.push_back(i == InsertC ? NumElts : i);
+    SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
+    return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
   }
 
   // Special case for single non-zero, non-undef, element.
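The heart of the change is the shuffle-mask construction: mask entries 0..NumElts-1 select the corresponding lane of the constant-pool load, and the single entry equal to NumElts selects lane 0 of the SCALAR_TO_VECTOR node carrying the variable element, so no 128-bit lane extract/insert sequence is needed. A minimal sketch of that mask logic (standalone C++, not LLVM API; buildInsertMask is an invented name for illustration):

// Hedged sketch of the ShuffleMask loop added above (standalone C++, not
// LLVM API; buildInsertMask is an invented name). Two-source shuffle
// convention: indices [0, NumElts) select from the first source (the
// constant-pool load) and index NumElts selects lane 0 of the second
// source (SCALAR_TO_VECTOR of the variable element).
#include <cstdio>
#include <vector>

static std::vector<int> buildInsertMask(unsigned NumElts, unsigned InsertC) {
  std::vector<int> Mask;
  for (unsigned i = 0; i != NumElts; ++i)
    Mask.push_back(i == InsertC ? (int)NumElts : (int)i);
  return Mask;
}

int main() {
  // v8i32 with the variable element going into lane 7 (elt7_v8i32 below).
  for (int M : buildInsertMask(8, 7))
    std::printf("%d ", M); // prints: 0 1 2 3 4 5 6 8
  std::printf("\n");
  return 0;
}

The same loop applied to the elt5_v8i64 case produces [0,1,2,3,4,8,6,7], which is exactly the index vector the AVX-512 test below loads into zmm2 for vpermt2q.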
Index: llvm/trunk/test/CodeGen/X86/insert-into-constant-vector.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/insert-into-constant-vector.ll
+++ llvm/trunk/test/CodeGen/X86/insert-into-constant-vector.ll
@@ -273,50 +273,48 @@
 ;
 ; X32AVX1-LABEL: elt7_v8i32:
 ; X32AVX1: # %bb.0:
-; X32AVX1-NEXT: vmovaps {{.*#+}} ymm0 = <42,1,2,3,4,5,6,u>
-; X32AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X32AVX1-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; X32AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,0]
+; X32AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32AVX1-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
 ; X32AVX1-NEXT: retl
 ;
 ; X64AVX1-LABEL: elt7_v8i32:
 ; X64AVX1: # %bb.0:
-; X64AVX1-NEXT: vmovaps {{.*#+}} ymm0 = <42,1,2,3,4,5,6,u>
-; X64AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X64AVX1-NEXT: vpinsrd $3, %edi, %xmm1, %xmm1
-; X64AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64AVX1-NEXT: vmovd %edi, %xmm0
+; X64AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
+; X64AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64AVX1-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
 ; X64AVX1-NEXT: retq
 ;
 ; X32AVX2-LABEL: elt7_v8i32:
 ; X32AVX2: # %bb.0:
-; X32AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = <42,1,2,3,4,5,6,u>
-; X32AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X32AVX2-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; X32AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; X32AVX2-NEXT: vbroadcastss {{[0-9]+}}(%esp), %xmm0
+; X32AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32AVX2-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
 ; X32AVX2-NEXT: retl
 ;
 ; X64AVX2-LABEL: elt7_v8i32:
 ; X64AVX2: # %bb.0:
-; X64AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = <42,1,2,3,4,5,6,u>
-; X64AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64AVX2-NEXT: vpinsrd $3, %edi, %xmm1, %xmm1
-; X64AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; X64AVX2-NEXT: vmovd %edi, %xmm0
+; X64AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
+; X64AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X64AVX2-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
 ; X64AVX2-NEXT: retq
 ;
 ; X32AVX512F-LABEL: elt7_v8i32:
 ; X32AVX512F: # %bb.0:
-; X32AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = <42,1,2,3,4,5,6,u>
-; X32AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X32AVX512F-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; X32AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; X32AVX512F-NEXT: vbroadcastss {{[0-9]+}}(%esp), %xmm0
+; X32AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32AVX512F-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
 ; X32AVX512F-NEXT: retl
 ;
 ; X64AVX512F-LABEL: elt7_v8i32:
 ; X64AVX512F: # %bb.0:
-; X64AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = <42,1,2,3,4,5,6,u>
-; X64AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64AVX512F-NEXT: vpinsrd $3, %edi, %xmm1, %xmm1
-; X64AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; X64AVX512F-NEXT: vmovd %edi, %xmm0
+; X64AVX512F-NEXT: vpbroadcastd %xmm0, %xmm0
+; X64AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X64AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
 ; X64AVX512F-NEXT: retq
   %ins = insertelement <8 x i32> <i32 42, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, i32 %x, i32 7
   ret <8 x i32> %ins
@@ -354,21 +352,46 @@
 ; X64SSE4-NEXT: movaps {{.*#+}} xmm0 = [4.2E+1,1.0E+0,2.0E+0,3.0E+0]
 ; X64SSE4-NEXT: retq
 ;
-; X32AVX-LABEL: elt6_v8f32:
-; X32AVX: # %bb.0:
-; X32AVX-NEXT: vmovaps {{.*#+}} ymm0 = <4.2E+1,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,u,7.0E+0>
-; X32AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X32AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
-; X32AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; X32AVX-NEXT: retl
+; X32AVX1-LABEL: elt6_v8f32:
+; X32AVX1: # %bb.0:
+; X32AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; X32AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32AVX1-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6],mem[7]
+; X32AVX1-NEXT: retl
 ;
-; X64AVX-LABEL: elt6_v8f32:
-; X64AVX: # %bb.0:
-; X64AVX-NEXT: vmovaps {{.*#+}} ymm1 = <4.2E+1,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,u,7.0E+0>
-; X64AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X64AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm0[0],xmm2[3]
-; X64AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; X64AVX-NEXT: retq
+; X64AVX1-LABEL: elt6_v8f32:
+; X64AVX1: # %bb.0:
+; X64AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; X64AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64AVX1-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6],mem[7]
+; X64AVX1-NEXT: retq
+;
+; X32AVX2-LABEL: elt6_v8f32:
+; X32AVX2: # %bb.0:
+; X32AVX2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
+; X32AVX2-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6],mem[7]
+; X32AVX2-NEXT: retl
+;
+; X64AVX2-LABEL: elt6_v8f32:
+; X64AVX2: # %bb.0:
+; X64AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
+; X64AVX2-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6],mem[7]
+; X64AVX2-NEXT: retq
+;
+; X32AVX512F-LABEL: elt6_v8f32:
+; X32AVX512F: # %bb.0:
+; X32AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32AVX512F-NEXT: vbroadcastsd %xmm0, %ymm0
+; X32AVX512F-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6],mem[7]
+; X32AVX512F-NEXT: retl
+;
+; X64AVX512F-LABEL: elt6_v8f32:
+; X64AVX512F: # %bb.0:
+; X64AVX512F-NEXT: vbroadcastsd %xmm0, %ymm0
+; X64AVX512F-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6],mem[7]
+; X64AVX512F-NEXT: retq
   %ins = insertelement <8 x float> <float 42.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0>, float %x, i32 6
   ret <8 x float> %ins
 }
@@ -453,10 +476,10 @@
 ;
 ; X64AVX512F-LABEL: elt5_v8i64:
 ; X64AVX512F: # %bb.0:
+; X64AVX512F-NEXT: vmovq %rdi, %xmm1
+; X64AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,8,6,7]
 ; X64AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <42,1,2,3,4,u,6,7>
-; X64AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; X64AVX512F-NEXT: vpinsrq $1, %rdi, %xmm1, %xmm1
-; X64AVX512F-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
+; X64AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
 ; X64AVX512F-NEXT: retq
   %ins = insertelement <8 x i64> <i64 42, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, i64 %x, i32 5
   ret <8 x i64> %ins
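To read the new check lines: a two-source shuffle takes indices below NumElts from the first source and indices of NumElts and above from the second. A hedged sketch of that selection rule (standalone C++, not LLVM code; applyShuffle and the sample values are invented for illustration), traced on the elt5_v8i64 case above:

// Hedged sketch of how a two-source shuffle applies the mask (standalone
// C++, not LLVM code; applyShuffle and the sample values are invented).
// Traced on elt5_v8i64: mask [0,1,2,3,4,8,6,7] matches the vpermt2q index
// vector loaded into zmm2 above.
#include <cstdint>
#include <cstdio>
#include <vector>

static std::vector<int64_t> applyShuffle(const std::vector<int64_t> &A,
                                         const std::vector<int64_t> &B,
                                         const std::vector<int> &Mask) {
  std::vector<int64_t> Result;
  for (int M : Mask) // index < NumElts reads A; otherwise lane M-NumElts of B
    Result.push_back(M < (int)A.size() ? A[M] : B[M - A.size()]);
  return Result;
}

int main() {
  std::vector<int64_t> Constants = {42, 1, 2, 3, 4, 5, 6, 7}; // constant-pool load
  std::vector<int64_t> Variable(8, 0); // lane 0 holds %x; lanes 1-7 model undef
  Variable[0] = 123;
  std::vector<int> Mask = {0, 1, 2, 3, 4, 8, 6, 7}; // insert %x into lane 5
  for (int64_t V : applyShuffle(Constants, Variable, Mask))
    std::printf("%lld ", (long long)V); // prints: 42 1 2 3 4 123 6 7
  std::printf("\n");
  return 0;
}

Only lane 0 of the second source is ever read, because the mask references index 8 exactly once; that is why SCALAR_TO_VECTOR's undef upper lanes are harmless here.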