Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -5396,6 +5396,57 @@ return V; } +/// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32. +static SDValue LowerBuildVectorv4x32(SDValue Op, unsigned NumElems, + unsigned NonZeros, unsigned NumNonZero, + unsigned NumZero, SelectionDAG &DAG, + const X86Subtarget *Subtarget, + const TargetLowering &TLI) { + // We know there's at least one non-zero element + unsigned FirstNonZeroIdx = 0; + SDValue FirstNonZero = Op->getOperand(FirstNonZeroIdx); + while (FirstNonZero.getOpcode() == ISD::UNDEF || + X86::isZeroNode(FirstNonZero)) { + ++FirstNonZeroIdx; + FirstNonZero = Op->getOperand(FirstNonZeroIdx); + } + + if (FirstNonZero.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + + SDValue V = FirstNonZero.getOperand(0); + unsigned CorrectIdx = cast(FirstNonZero.getOperand(1)) + ->getZExtValue() == FirstNonZeroIdx; + + for (unsigned Idx = FirstNonZeroIdx + 1; Idx < NumElems; ++Idx) { + SDValue Elem = Op.getOperand(Idx); + if (Elem.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elem)) + continue; + + // TODO: What else can be here? Deal with it. + if (Elem.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + + // TODO: Some optimizations are still possible here + // ex: Getting one element from a vector, and the rest from another. + if (Elem.getOperand(0) != V) + return SDValue(); + + if (cast(Elem.getOperand(1))->getZExtValue() == Idx) + ++CorrectIdx; + } + + if (NumNonZero != CorrectIdx) + return SDValue(); + + // We're copying a vector and setting some values to 0 + SDLoc dl(Op); + EVT VT = Op.getSimpleValueType(); + SDValue InsertpsMask = DAG.getIntPtrConstant( + FirstNonZeroIdx << 6 | FirstNonZeroIdx << 4 | (~NonZeros & 0xf)); + return DAG.getNode(X86ISD::INSERTPS, dl, VT, V, V, InsertpsMask); +} + /// getVShift - Return a vector logical shift node. /// static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, @@ -6147,6 +6198,14 @@ if (V.getNode()) return V; } + // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS + if (EVTBits == 32 && NumElems == 4) { + SDValue V = LowerBuildVectorv4x32(Op, NumElems, NonZeros, NumNonZero, + NumZero, DAG, Subtarget, *this); + if (V.getNode()) + return V; + } + // If element VT is == 32 bits, turn it into a number of shuffles. SmallVector V(NumElems); if (NumElems == 4 && NumZero > 0) { Index: test/CodeGen/X86/sse41.ll =================================================================== --- test/CodeGen/X86/sse41.ll +++ test/CodeGen/X86/sse41.ll @@ -320,3 +320,152 @@ %result = shufflevector <4 x i32> %a, <4 x i32> %2, <4 x i32> ret <4 x i32> %result } + +;;;;;;; Shuffles optimizable with a single insertps instruction +define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) { +; CHECK-LABEL: shuf_XYZ0: +; CHECK-NOT: pextrd +; CHECK-NOT: punpckldq +; CHECK: insertps $8 +; CHECK: ret + %vecext = extractelement <4 x float> %x, i32 0 + %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 + %vecext1 = extractelement <4 x float> %x, i32 1 + %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1 + %vecext3 = extractelement <4 x float> %x, i32 2 + %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2 + %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3 + ret <4 x float> %vecinit5 +} + +define <4 x float> @shuf_XY00(<4 x float> %x, <4 x float> %a) { +; CHECK-LABEL: shuf_XY00: +; CHECK-NOT: pextrd +; CHECK-NOT: punpckldq +; CHECK: insertps $12 +; CHECK: ret + %vecext = extractelement <4 x float> %x, i32 0 + %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 + %vecext1 = extractelement <4 x float> %x, i32 1 + %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1 + %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2 + %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3 + ret <4 x float> %vecinit4 +} + +define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) { +; CHECK-LABEL: shuf_X00A: +; CHECK-NOT: movaps +; CHECK-NOT: shufps +; CHECK: insertps $48 +; CHECK: ret + %vecext = extractelement <4 x float> %x, i32 0 + %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 + %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1 + %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2 + %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %a, <4 x i32> + ret <4 x float> %vecinit4 +} + +define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) { +; CHECK-LABEL: shuf_X00X: +; CHECK-NOT: movaps +; CHECK-NOT: shufps +; CHECK: insertps $48 +; CHECK: ret + %vecext = extractelement <4 x float> %x, i32 0 + %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 + %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1 + %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2 + %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %x, <4 x i32> + ret <4 x float> %vecinit4 +} + +define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) { +; CHECK-LABEL: shuf_X0YC: +; CHECK: shufps +; CHECK-NOT: movhlps +; CHECK-NOT: shufps +; CHECK: insertps $176 +; CHECK: ret + %vecext = extractelement <4 x float> %x, i32 0 + %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 + %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1 + %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %x, <4 x i32> + %vecinit5 = shufflevector <4 x float> %vecinit3, <4 x float> %a, <4 x i32> + ret <4 x float> %vecinit5 +} + +define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) { +; CHECK-LABEL: i32_shuf_XYZ0: +; CHECK-NOT: pextrd +; CHECK-NOT: punpckldq +; CHECK: insertps $8 +; CHECK: ret + %vecext = extractelement <4 x i32> %x, i32 0 + %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 + %vecext1 = extractelement <4 x i32> %x, i32 1 + %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1 + %vecext3 = extractelement <4 x i32> %x, i32 2 + %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext3, i32 2 + %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3 + ret <4 x i32> %vecinit5 +} + +define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) { +; CHECK-LABEL: i32_shuf_XY00: +; CHECK-NOT: pextrd +; CHECK-NOT: punpckldq +; CHECK: insertps $12 +; CHECK: ret + %vecext = extractelement <4 x i32> %x, i32 0 + %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 + %vecext1 = extractelement <4 x i32> %x, i32 1 + %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1 + %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2 + %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3 + ret <4 x i32> %vecinit4 +} + +define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) { +; CHECK-LABEL: i32_shuf_X00A: +; CHECK-NOT: movaps +; CHECK-NOT: shufps +; CHECK: insertps $48 +; CHECK: ret + %vecext = extractelement <4 x i32> %x, i32 0 + %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 + %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1 + %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2 + %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %a, <4 x i32> + ret <4 x i32> %vecinit4 +} + +define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) { +; CHECK-LABEL: i32_shuf_X00X: +; CHECK-NOT: movaps +; CHECK-NOT: shufps +; CHECK: insertps $48 +; CHECK: ret + %vecext = extractelement <4 x i32> %x, i32 0 + %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 + %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1 + %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2 + %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %x, <4 x i32> + ret <4 x i32> %vecinit4 +} + +define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) { +; CHECK-LABEL: i32_shuf_X0YC: +; CHECK: shufps +; CHECK-NOT: movhlps +; CHECK-NOT: shufps +; CHECK: insertps $176 +; CHECK: ret + %vecext = extractelement <4 x i32> %x, i32 0 + %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 + %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1 + %vecinit3 = shufflevector <4 x i32> %vecinit1, <4 x i32> %x, <4 x i32> + %vecinit5 = shufflevector <4 x i32> %vecinit3, <4 x i32> %a, <4 x i32> + ret <4 x i32> %vecinit5 +}