Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -5397,6 +5397,118 @@
   return V;
 }
 
+/// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
+///
+/// Tries to emit a single X86ISD::INSERTPS (source vector inserted into
+/// itself) for a build_vector whose operands are each undef, zero, or an
+/// EXTRACT_VECTOR_ELT with a constant index from one common source vector.
+/// At most one extracted element may be placed in a lane different from the
+/// one it was extracted from; lanes not flagged in \p NonZeros are zeroed.
+///
+/// \param Op         The build_vector node to lower.
+/// \param NumElems   Number of operands of \p Op to inspect.
+/// \param NonZeros   Bitmask of lanes to keep; its complement (low four bits)
+///                   becomes the zero mask of the INSERTPS immediate.
+/// \param NumNonZero Compared against the number of matched operands before
+///                   the node is emitted.
+/// \param NumZero    Unused.
+/// \param DAG        DAG used to create the new nodes.
+/// \param Subtarget  Queried for SSE4.1 support, which INSERTPS requires.
+/// \param TLI        Unused.
+/// \returns The INSERTPS node, or an empty SDValue if the pattern does not
+/// match.
+static SDValue LowerBuildVectorv4x32(SDValue Op, unsigned NumElems,
+                                     unsigned NonZeros, unsigned NumNonZero,
+                                     unsigned NumZero, SelectionDAG &DAG,
+                                     const X86Subtarget *Subtarget,
+                                     const TargetLowering &TLI) {
+  // INSERTPS is an SSE4.1 instruction.
+  if (!Subtarget->hasSSE41())
+    return SDValue();
+
+  // Find the first operand that is neither undef nor zero. Callers are
+  // expected to guarantee one exists, but never read past the operand list.
+  unsigned FirstNonZeroIdx = 0;
+  while (FirstNonZeroIdx < NumElems &&
+         (Op.getOperand(FirstNonZeroIdx).getOpcode() == ISD::UNDEF ||
+          X86::isZeroNode(Op.getOperand(FirstNonZeroIdx))))
+    ++FirstNonZeroIdx;
+  if (FirstNonZeroIdx == NumElems)
+    return SDValue();
+
+  SDValue FirstNonZero = Op.getOperand(FirstNonZeroIdx);
+  if (FirstNonZero.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+      !isa<ConstantSDNode>(FirstNonZero.getOperand(1)))
+    return SDValue();
+
+  // V is used directly as both INSERTPS operands, so it must have the same
+  // type as the vector being built.
+  SDValue V = FirstNonZero.getOperand(0);
+  if (V.getValueType() != Op.getValueType())
+    return SDValue();
+
+  // "Dst" is the lane of V an element is read from; "Idx" is the lane of the
+  // result it is written to. A lane index outside the vector cannot be
+  // encoded in the 2-bit immediate field.
+  unsigned FirstNonZeroDst =
+      cast<ConstantSDNode>(FirstNonZero.getOperand(1))->getZExtValue();
+  if (FirstNonZeroDst >= NumElems)
+    return SDValue();
+  unsigned CorrectIdx = FirstNonZeroDst == FirstNonZeroIdx;
+  unsigned IncorrectIdx = CorrectIdx ? -1U : FirstNonZeroIdx;
+  unsigned IncorrectDst = CorrectIdx ? -1U : FirstNonZeroDst;
+
+  for (unsigned Idx = FirstNonZeroIdx + 1; Idx < NumElems; ++Idx) {
+    SDValue Elem = Op.getOperand(Idx);
+    if (Elem.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elem))
+      continue;
+
+    // TODO: What else can be here? Deal with it.
+    // The index must be a constant for the cast<> below to be valid.
+    if (Elem.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+        !isa<ConstantSDNode>(Elem.getOperand(1)))
+      return SDValue();
+
+    // TODO: Some optimizations are still possible here
+    // ex: Getting one element from a vector, and the rest from another.
+    if (Elem.getOperand(0) != V)
+      return SDValue();
+
+    unsigned Dst = cast<ConstantSDNode>(Elem.getOperand(1))->getZExtValue();
+    if (Dst >= NumElems)
+      return SDValue();
+
+    if (Dst == Idx) {
+      ++CorrectIdx;
+    } else if (IncorrectIdx == -1U) {
+      IncorrectIdx = Idx;
+      IncorrectDst = Dst;
+    } else {
+      // There was already one element with an incorrect index.
+      // We can't optimize this case to an insertps.
+      return SDValue();
+    }
+  }
+
+  if (NumNonZero == CorrectIdx || NumNonZero == CorrectIdx + 1) {
+    SDLoc dl(Op);
+    EVT VT = Op.getSimpleValueType();
+    // Immediate layout: bits [7:6] select the source lane, bits [5:4] the
+    // destination lane, and bits [3:0] the result lanes to zero.
+    unsigned ElementMoveMask = 0;
+    if (IncorrectIdx == -1U)
+      ElementMoveMask = FirstNonZeroIdx << 6 | FirstNonZeroIdx << 4;
+    else
+      ElementMoveMask = IncorrectDst << 6 | IncorrectIdx << 4;
+
+    SDValue InsertpsMask =
+        DAG.getIntPtrConstant(ElementMoveMask | (~NonZeros & 0xf));
+    return DAG.getNode(X86ISD::INSERTPS, dl, VT, V, V, InsertpsMask);
+  }
+
+  return SDValue();
+}
+
 /// getVShift - Return a vector logical shift node.
 ///
 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
@@ -6148,6 +6260,14 @@
     if (V.getNode()) return V;
   }
 
+  // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
+  if (EVTBits == 32 && NumElems == 4) {
+    SDValue V = LowerBuildVectorv4x32(Op, NumElems, NonZeros, NumNonZero,
+                                      NumZero, DAG, Subtarget, *this);
+    if (V.getNode())
+      return V;
+  }
+
   // If element VT is == 32 bits, turn it into a number of shuffles.
   SmallVector<SDValue, 8> V(NumElems);
   if (NumElems == 4 && NumZero > 0) {
Index: test/CodeGen/X86/sse41.ll
===================================================================
--- test/CodeGen/X86/sse41.ll
+++ test/CodeGen/X86/sse41.ll
@@ -320,3 +320,261 @@
   %result = shufflevector <4 x i32> %a, <4 x i32> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
   ret <4 x i32> %result
 }
+
+;;;;;; Shuffles optimizable with a single insertps instruction
+;; Test names spell out the result lanes: X, Y, Z, W are lanes 0-3 of %x,
+;; A and C are lanes 0 and 2 of %a, and 0 is a zeroed lane.
+define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) {
+; CHECK-LABEL: shuf_XYZ0:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $8
+; CHECK: ret
+  %vecext = extractelement <4 x float> %x, i32 0
+  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+  %vecext1 = extractelement <4 x float> %x, i32 1
+  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
+  %vecext3 = extractelement <4 x float> %x, i32 2
+  %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
+  %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
+  ret <4 x float> %vecinit5
+}
+
+define <4 x float> @shuf_XY00(<4 x float> %x, <4 x float> %a) {
+; CHECK-LABEL: shuf_XY00:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $12
+; CHECK: ret
+  %vecext = extractelement <4 x float> %x, i32 0
+  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+  %vecext1 = extractelement <4 x float> %x, i32 1
+  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
+  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
+  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
+  ret <4 x float> %vecinit4
+}
+
+define <4 x float> @shuf_XYY0(<4 x float> %x, <4 x float> %a) {
+; CHECK-LABEL: shuf_XYY0:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $104
+; CHECK: ret
+  %vecext = extractelement <4 x float> %x, i32 0
+  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+  %vecext1 = extractelement <4 x float> %x, i32 1
+  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
+  %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext1, i32 2
+  %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
+  ret <4 x float> %vecinit5
+}
+
+define <4 x float> @shuf_XYW0(<4 x float> %x, <4 x float> %a) {
+; CHECK-LABEL: shuf_XYW0:
+; CHECK: insertps $232
+; CHECK: ret
+  %vecext = extractelement <4 x float> %x, i32 0
+  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+  %vecext1 = extractelement <4 x float> %x, i32 1
+  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
+  %vecext2 = extractelement <4 x float> %x, i32 3
+  %vecinit3 = insertelement <4 x float> %vecinit2, float %vecext2, i32 2
+  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
+  ret <4 x float> %vecinit4
+}
+
+define <4 x float> @shuf_W00W(<4 x float> %x, <4 x float> %a) {
+; CHECK-LABEL: shuf_W00W:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $198
+; CHECK: ret
+  %vecext = extractelement <4 x float> %x, i32 3
+  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+  %vecinit2 = insertelement <4 x float> %vecinit, float 0.0, i32 1
+  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
+  %vecinit4 = insertelement <4 x float> %vecinit3, float %vecext, i32 3
+  ret <4 x float> %vecinit4
+}
+
+define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) {
+; CHECK-LABEL: shuf_X00A:
+; CHECK-NOT: movaps
+; CHECK-NOT: shufps
+; CHECK: insertps $48
+; CHECK: ret
+  %vecext = extractelement <4 x float> %x, i32 0
+  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
+  %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
+  %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+  ret <4 x float> %vecinit4
+}
+
+define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) {
+; CHECK-LABEL: shuf_X00X:
+; CHECK-NOT: movaps
+; CHECK-NOT: shufps
+; CHECK: insertps $48
+; CHECK: ret
+  %vecext = extractelement <4 x float> %x, i32 0
+  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
+  %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
+  %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+  ret <4 x float> %vecinit4
+}
+
+define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) {
+; CHECK-LABEL: shuf_X0YC:
+; CHECK: shufps
+; CHECK-NOT: movhlps
+; CHECK-NOT: shufps
+; CHECK: insertps $176
+; CHECK: ret
+  %vecext = extractelement <4 x float> %x, i32 0
+  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
+  %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
+  %vecinit5 = shufflevector <4 x float> %vecinit3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
+  ret <4 x float> %vecinit5
+}
+
+define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) {
+; CHECK-LABEL: i32_shuf_XYZ0:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $8
+; CHECK: ret
+  %vecext = extractelement <4 x i32> %x, i32 0
+  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+  %vecext1 = extractelement <4 x i32> %x, i32 1
+  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
+  %vecext3 = extractelement <4 x i32> %x, i32 2
+  %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext3, i32 2
+  %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
+  ret <4 x i32> %vecinit5
+}
+
+define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) {
+; CHECK-LABEL: i32_shuf_XY00:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $12
+; CHECK: ret
+  %vecext = extractelement <4 x i32> %x, i32 0
+  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+  %vecext1 = extractelement <4 x i32> %x, i32 1
+  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
+  %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
+  %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
+  ret <4 x i32> %vecinit4
+}
+
+define <4 x i32> @i32_shuf_XYY0(<4 x i32> %x, <4 x i32> %a) {
+; CHECK-LABEL: i32_shuf_XYY0:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $104
+; CHECK: ret
+  %vecext = extractelement <4 x i32> %x, i32 0
+  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+  %vecext1 = extractelement <4 x i32> %x, i32 1
+  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
+  %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext1, i32 2
+  %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
+  ret <4 x i32> %vecinit5
+}
+
+define <4 x i32> @i32_shuf_XYW0(<4 x i32> %x, <4 x i32> %a) {
+; CHECK-LABEL: i32_shuf_XYW0:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $232
+; CHECK: ret
+  %vecext = extractelement <4 x i32> %x, i32 0
+  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+  %vecext1 = extractelement <4 x i32> %x, i32 1
+  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
+  %vecext2 = extractelement <4 x i32> %x, i32 3
+  %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %vecext2, i32 2
+  %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
+  ret <4 x i32> %vecinit4
+}
+
+define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) {
+; CHECK-LABEL: i32_shuf_W00W:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $198
+; CHECK: ret
+  %vecext = extractelement <4 x i32> %x, i32 3
+  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+  %vecinit2 = insertelement <4 x i32> %vecinit, i32 0, i32 1
+  %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
+  %vecinit4 = insertelement <4 x i32> %vecinit3, i32 %vecext, i32 3
+  ret <4 x i32> %vecinit4
+}
+
+define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
+; CHECK-LABEL: i32_shuf_X00A:
+; CHECK-NOT: movaps
+; CHECK-NOT: shufps
+; CHECK: insertps $48
+; CHECK: ret
+  %vecext = extractelement <4 x i32> %x, i32 0
+  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+  %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
+  %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
+  %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+  ret <4 x i32> %vecinit4
+}
+
+define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
+; CHECK-LABEL: i32_shuf_X00X:
+; CHECK-NOT: movaps
+; CHECK-NOT: shufps
+; CHECK: insertps $48
+; CHECK: ret
+  %vecext = extractelement <4 x i32> %x, i32 0
+  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+  %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
+  %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
+  %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+  ret <4 x i32> %vecinit4
+}
+
+define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) {
+; CHECK-LABEL: i32_shuf_X0YC:
+; CHECK: shufps
+; CHECK-NOT: movhlps
+; CHECK-NOT: shufps
+; CHECK: insertps $176
+; CHECK: ret
+  %vecext = extractelement <4 x i32> %x, i32 0
+  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+  %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
+  %vecinit3 = shufflevector <4 x i32> %vecinit1, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
+  %vecinit5 = shufflevector <4 x i32> %vecinit3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
+  ret <4 x i32> %vecinit5
+}
+
+;; Test for a bug in the first implementation of LowerBuildVectorv4x32
+define <4 x float> @test_insertps_no_undef(<4 x float> %x) {
+; CHECK-LABEL: test_insertps_no_undef:
+; CHECK: movaps %xmm0, %xmm1
+; CHECK-NEXT: insertps $8, %xmm1, %xmm1
+; CHECK-NEXT: maxps %xmm1, %xmm0
+; CHECK-NEXT: ret
+  %vecext = extractelement <4 x float> %x, i32 0
+  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+  %vecext1 = extractelement <4 x float> %x, i32 1
+  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
+  %vecext3 = extractelement <4 x float> %x, i32 2
+  %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
+  %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
+  %mask = fcmp olt <4 x float> %vecinit5, %x
+  %res = select <4 x i1> %mask, <4 x float> %x, <4 x float> %vecinit5
+  ret <4 x float> %res
+}