Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -3925,6 +3925,29 @@
   return true;
 }
 
+/// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to INSERTPS.
+/// i. e: If all but one element come from the same vector.
+static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
+  // TODO: Deal with AVX's VINSERTPS
+  if (!VT.is128BitVector() || VT != MVT::v4f32)
+    return false;
+
+  unsigned CorrectPosV1 = 0;
+  unsigned CorrectPosV2 = 0;
+  for (int i = 0; i < (int)VT.getVectorNumElements(); ++i)
+    if (Mask[i] == i)
+      ++CorrectPosV1;
+    else if (Mask[i] == i + 4)
+      ++CorrectPosV2;
+
+  if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
+    // We have 3 elements from one vector, and one from another.
+    return true;
+
+  return false;
+}
+
 //
 // Some special combinations that can be optimized.
 //
@@ -7257,6 +7280,67 @@
                               getShuffleSHUFImmediate(SVOp), DAG);
 }
 
+static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
+                           SelectionDAG &DAG, bool HasAVX) {
+  // Generate an insertps instruction when inserting an f32 from memory onto a
+  // v4f32 or when copying a member from one v4f32 to another.
+  // TODO: Optimize for AVX cases too (VINSERTPS)
+  MVT VT = SVOp->getSimpleValueType(0);
+  MVT EVT = VT.getVectorElementType();
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  auto Mask = SVOp->getMask();
+  assert(VT == MVT::v4f32 && "unsupported vector type for insertps");
+
+  int FromV1 = std::count_if(Mask.begin(), Mask.end(),
+                             [](const int &i) { return i < 4; });
+
+  SDValue From;
+  SDValue To;
+  unsigned DestIndex;
+  if (FromV1 == 1) {
+    From = V1;
+    To = V2;
+    DestIndex = std::find_if(Mask.begin(), Mask.end(),
+                             [](const int &i) { return i < 4; }) -
+                Mask.begin();
+  } else {
+    From = V2;
+    To = V1;
+    DestIndex = std::find_if(Mask.begin(), Mask.end(),
+                             [](const int &i) { return i >= 4; }) -
+                Mask.begin();
+  }
+
+  if (MayFoldLoad(From)) {
+    // Trivial case, when From comes from a load and is only used by the
+    // shuffle. Make it use insertps from the vector that we need from that
+    // load.
+    SDValue Addr = From.getOperand(1);
+    SDValue NewAddr =
+        DAG.getNode(ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
+                    DAG.getConstant(DestIndex * EVT.getStoreSize(),
+                                    Addr.getSimpleValueType()));
+
+    LoadSDNode *Load = cast<LoadSDNode>(From);
+    SDValue Ld = DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
+                             DAG.getMachineFunction().getMachineMemOperand(
+                                 Load->getMemOperand(), 0, EVT.getStoreSize()));
+
+    // Create this as a scalar to vector to match the instruction pattern.
+    SDValue LoadScalarToVector =
+        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Ld);
+    SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4);
+    return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector,
+                       InsertpsMask);
+  }
+
+  // Vector-element-to-vector
+  unsigned SrcIndex = Mask[DestIndex] % 4;
+  SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6);
+  return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask);
+}
+
 // Reduce a vector shuffle to zext.
static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { @@ -7416,6 +7500,7 @@ bool V1IsSplat = false; bool V2IsSplat = false; bool HasSSE2 = Subtarget->hasSSE2(); + bool HasSSE4 = Subtarget->hasSSE41(); bool HasFp256 = Subtarget->hasFp256(); bool HasInt256 = Subtarget->hasInt256(); MachineFunction &MF = DAG.getMachineFunction(); @@ -7641,6 +7726,9 @@ if (isUNPCKH_v_undef_Mask(M, VT, HasInt256)) return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); + if (HasSSE4 && isINSERTPSMask(M, VT)) + return getINSERTPS(SVOp, dl, DAG, Subtarget->hasAVX()); + //===--------------------------------------------------------------------===// // Generate target specific nodes for 128 or 256-bit shuffles only // supported in the AVX instruction set. Index: test/CodeGen/X86/sse41.ll =================================================================== --- test/CodeGen/X86/sse41.ll +++ test/CodeGen/X86/sse41.ll @@ -249,3 +249,31 @@ ; X64: ret } +define <4 x float> @insertps_from_shufflevector_1(<4 x float> %a, <4 x float>* nocapture readonly %pb) { +entry: + %0 = load <4 x float>* %pb, align 16 + %vecinit6 = shufflevector <4 x float> %a, <4 x float> %0, <4 x i32> + ret <4 x float> %vecinit6 +; X32-LABEL: insertps_from_shufflevector_1: +; X32-NOT: shufps +; X32: insertps +; X32: ret +; X64-LABEL: insertps_from_shufflevector_1: +; X64-NOT: shufps +; X64: insertps +; X64: ret +} + +define <4 x float> @insertps_from_shufflevector_2(<4 x float> %a, <4 x float> %b) { +entry: + %vecinit6 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + ret <4 x float> %vecinit6 +; X32-LABEL: insertps_from_shufflevector_2: +; X32: insertps +; X32-NOT: shufps +; X32: ret +; X64-LABEL: insertps_from_shufflevector_2: +; X64: insertps +; X64-NOT: shufps +; X64: ret +}