Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -3931,6 +3931,43 @@
   return true;
 }
 
+/// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to INSERTPS,
+/// i.e. three elements stay in place in one input vector and the remaining
+/// element comes from (any lane of) the other input vector.
+static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
+  // TODO: Deal with AVX's VINSERTPS
+  if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
+    return false;
+
+  unsigned CorrectPosV1 = 0;
+  unsigned CorrectPosV2 = 0;
+  unsigned FromV1 = 0;
+  unsigned FromV2 = 0;
+  for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) {
+    // An undef lane cannot be attributed to either input vector, so
+    // conservatively leave such masks to the other shuffle lowerings.
+    if (Mask[i] < 0)
+      return false;
+
+    if (Mask[i] < 4)
+      ++FromV1;
+    else
+      ++FromV2;
+
+    if (Mask[i] == i)
+      ++CorrectPosV1;
+    else if (Mask[i] == i + 4)
+      ++CorrectPosV2;
+  }
+
+  // Three elements must stay in place in one vector and the fourth must come
+  // from the *other* vector: getINSERTPS does not handle a misplaced element
+  // of the same vector (e.g. <0, 1, 2, 0>).
+  return (CorrectPosV1 == 3 && FromV2 == 1) ||
+         (CorrectPosV2 == 3 && FromV1 == 1);
+}
+
 //
 // Some special combinations that can be optimized.
 //
@@ -7263,6 +7300,92 @@
                               getShuffleSHUFImmediate(SVOp), DAG);
 }
 
+// It is only safe to call this function if isINSERTPSMask is true for
+// this shufflevector mask.
+static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
+                           SelectionDAG &DAG) {
+  // Generate an insertps instruction when inserting an f32 from memory onto a
+  // v4f32 or when copying a member from one v4f32 to another.
+  // We also use it for transferring i32 from one register to another,
+  // since it simply copies the same bits.
+  // If we're transferring an i32 from memory to a specific element in a
+  // register, we output a generic DAG that will match the PINSRD
+  // instruction.
+  // TODO: Optimize for AVX cases too (VINSERTPS)
+  MVT VT = SVOp->getSimpleValueType(0);
+  MVT EltVT = VT.getVectorElementType();
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  auto Mask = SVOp->getMask();
+  assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
+         "unsupported vector type for insertps/pinsrd");
+
+  // Negative mask entries denote undef lanes and belong to neither input.
+  auto IsFromV1 = [](const int &i) { return i >= 0 && i < 4; };
+  auto IsFromV2 = [](const int &i) { return i >= 4; };
+  int FromV1 = std::count_if(Mask.begin(), Mask.end(), IsFromV1);
+
+  // "From" supplies the single inserted element; "To" supplies the other
+  // three lanes and is the vector being inserted into.
+  SDValue From;
+  SDValue To;
+  unsigned DestIndex;
+  if (FromV1 == 1) {
+    From = V1;
+    To = V2;
+    DestIndex = std::find_if(Mask.begin(), Mask.end(), IsFromV1) - Mask.begin();
+  } else {
+    From = V2;
+    To = V1;
+    DestIndex = std::find_if(Mask.begin(), Mask.end(), IsFromV2) - Mask.begin();
+  }
+  assert(DestIndex < 4 && "mask was not validated by isINSERTPSMask");
+
+  // Lane of From that is being inserted into lane DestIndex of To.
+  unsigned SrcIndex = Mask[DestIndex] % 4;
+
+  if (MayFoldLoad(From)) {
+    // Trivial case, when From comes from a load and is only used by the
+    // shuffle. Make it use insertps from the vector that we need from that
+    // load.
+    // The element to load lives at lane SrcIndex of the loaded vector, so
+    // the narrowed scalar load must be offset by SrcIndex elements (not by
+    // DestIndex, which only selects the lane being written).
+    LoadSDNode *Load = cast<LoadSDNode>(From);
+    unsigned SrcOffset = SrcIndex * EltVT.getStoreSize();
+    SDValue Addr = From.getOperand(1);
+    SDValue NewAddr =
+        DAG.getNode(ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
+                    DAG.getConstant(SrcOffset, Addr.getSimpleValueType()));
+
+    SDValue NewLoad =
+        DAG.getLoad(EltVT, dl, Load->getChain(), NewAddr,
+                    DAG.getMachineFunction().getMachineMemOperand(
+                        Load->getMemOperand(), SrcOffset,
+                        EltVT.getStoreSize()));
+
+    if (EltVT == MVT::f32) {
+      // Create this as a scalar to vector to match the instruction pattern.
+      SDValue LoadScalarToVector =
+          DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad);
+      SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4);
+      return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector,
+                         InsertpsMask);
+    } else { // EltVT == MVT::i32
+      // If we're getting an i32 from memory, use an INSERT_VECTOR_ELT
+      // instruction, to match the PINSRD instruction, which loads an i32 to a
+      // certain vector element.
+      return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad,
+                         DAG.getConstant(DestIndex, MVT::i32));
+    }
+  }
+
+  // Vector-element-to-vector
+  SDValue InsertpsMask =
+      DAG.getIntPtrConstant((DestIndex << 4) | (SrcIndex << 6));
+  return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask);
+}
+
 // Reduce a vector shuffle to zext.
 static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
                                     SelectionDAG &DAG) {
@@ -7674,6 +7797,9 @@
   if (BlendOp.getNode())
     return BlendOp;
 
+  if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT))
+    return getINSERTPS(SVOp, dl, DAG);
+
   unsigned Imm8;
   if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
     return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);
Index: test/CodeGen/X86/sse41.ll
===================================================================
--- test/CodeGen/X86/sse41.ll
+++ test/CodeGen/X86/sse41.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X32 --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X64 --check-prefix=CHECK
 
 @g16 = external global i16
 
@@ -249,3 +249,87 @@
 ; X64: ret
 }
 
+define <4 x float> @insertps_from_shufflevector_1(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+entry:
+  %0 = load <4 x float>* %pb, align 16
+  %vecinit6 = shufflevector <4 x float> %a, <4 x float> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+  ret <4 x float> %vecinit6
+; CHECK-LABEL: insertps_from_shufflevector_1:
+; CHECK-NOT: shufps
+; CHECK: insertps $48,
+; CHECK: ret
+}
+
+define <4 x float> @insertps_from_shufflevector_2(<4 x float> %a, <4 x float> %b) {
+entry:
+  %vecinit6 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
+  ret <4 x float> %vecinit6
+; CHECK-LABEL: insertps_from_shufflevector_2:
+; CHECK-NOT: mov
+; CHECK-NOT: shufps
+; CHECK: insertps $96,
+; CHECK: ret
+}
+
+; For loading an i32 from memory into an xmm register we use pinsrd
+; instead of insertps
+define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, <4 x i32>* nocapture readonly %pb) {
+entry:
+  %0 = load <4 x i32>* %pb, align 16
+  %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+  ret <4 x i32> %vecinit6
+; CHECK-LABEL: pinsrd_from_shufflevector_i32:
+; CHECK-NOT: mov
+; CHECK-NOT: shufps
+; CHECK: pinsrd $3,
+; CHECK: ret
+}
+
+define <4 x i32> @insertps_from_shufflevector_i32_2(<4 x i32> %a, <4 x i32> %b) {
+entry:
+  %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
+  ret <4 x i32> %vecinit6
+; CHECK-LABEL: insertps_from_shufflevector_i32_2:
+; CHECK-NOT: mov
+; CHECK-NOT: shufps
+; CHECK: insertps $208,
+; CHECK: ret
+}
+
+define <4 x float> @insertps_from_load_ins_elt_undef(<4 x float> %a, float* %b) {
+; CHECK-LABEL: insertps_from_load_ins_elt_undef:
+; CHECK-NOT: mov
+; CHECK-NOT: shufps
+; CHECK: insertps $16,
+; CHECK: ret
+  %1 = load float* %b, align 4
+  %2 = insertelement <4 x float> undef, float %1, i32 0
+  %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
+  ret <4 x float> %result
+}
+
+define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) {
+; CHECK-LABEL: insertps_from_load_ins_elt_undef_i32:
+; TODO: Like on pinsrd_from_shufflevector_i32, remove this mov instr
+;; aCHECK-NOT: mov
+; CHECK-NOT: shufps
+; CHECK: insertps $32,
+; CHECK: ret
+  %1 = load i32* %b, align 4
+  %2 = insertelement <4 x i32> undef, i32 %1, i32 0
+  %result = shufflevector <4 x i32> %a, <4 x i32> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
+  ret <4 x i32> %result
+}
+
+; The scalar load folded into insertps must read the *source* lane of the
+; loaded vector (element 1 => byte offset 4), not the destination lane.
+define <4 x float> @insertps_from_shufflevector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+entry:
+  %0 = load <4 x float>* %pb, align 16
+  %vecinit6 = shufflevector <4 x float> %a, <4 x float> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 5>
+  ret <4 x float> %vecinit6
+; CHECK-LABEL: insertps_from_shufflevector_load_offset:
+; CHECK-NOT: shufps
+; CHECK: insertps $48, 4(%{{[a-z]+}}),
+; CHECK: ret
+}