Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -7402,6 +7402,23 @@
                               getShuffleSHUFImmediate(SVOp), DAG);
 }
 
+static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
+                                         SelectionDAG &DAG) {
+  SDLoc dl(Load);
+  MVT VT = Load->getSimpleValueType(0);
+  MVT EVT = VT.getVectorElementType();
+  SDValue Addr = Load->getOperand(1);
+  SDValue NewAddr = DAG.getNode(
+      ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
+      DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType()));
+
+  SDValue NewLoad =
+      DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
+                  DAG.getMachineFunction().getMachineMemOperand(
+                      Load->getMemOperand(), 0, EVT.getStoreSize()));
+  return NewLoad;
+}
+
 // It is only safe to call this function if isINSERTPSMask is true for
 // this shufflevector mask.
 static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
@@ -7446,17 +7463,10 @@
   // Trivial case, when From comes from a load and is only used by the
   // shuffle. Make it use insertps from the vector that we need from that
   // load.
-  SDValue Addr = From.getOperand(1);
-  SDValue NewAddr =
-      DAG.getNode(ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
-                  DAG.getConstant(DestIndex * EVT.getStoreSize(),
-                                  Addr.getSimpleValueType()));
-
-  LoadSDNode *Load = cast<LoadSDNode>(From);
   SDValue NewLoad =
-      DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
-                  DAG.getMachineFunction().getMachineMemOperand(
-                      Load->getMemOperand(), 0, EVT.getStoreSize()));
+      NarrowVectorLoadToElement(cast<LoadSDNode>(From), DestIndex, DAG);
+  if (!NewLoad.getNode())
+    return SDValue();
 
   if (EVT == MVT::f32) {
     // Create this as a scalar to vector to match the instruction pattern.
@@ -20093,6 +20103,51 @@
   return SDValue();
 }
 
+static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget *Subtarget) {
+  SDLoc dl(N);
+  MVT VT = N->getOperand(1)->getSimpleValueType(0);
+  MVT EVT = VT.getVectorElementType();
+  if (VT != MVT::v4f32)
+    return SDValue();
+
+  SDValue Ld = N->getOperand(1);
+  if (MayFoldLoad(Ld)) {
+    // The countS bits of the immediate select the element that insertps
+    // reads from the source vector, so narrow the load to that element.
+    unsigned DestIndex =
+        cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
+    Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
+    if (!Ld.getNode())
+      return SDValue();
+  } else if (Ld->getOpcode() == X86ISD::VBROADCAST) {
+    Ld = Ld->getOperand(0);
+    if (!MayFoldLoad(Ld))
+      return SDValue();
+  } else if (Ld->getOpcode() == X86ISD::PSHUFD) {
+    // SSE4.1 without AVX will end up lowering broadcasts as a pshufd of a
+    // scalar_to_vector, with imm = 0. Special case this version.
+    if (cast<ConstantSDNode>(Ld->getOperand(1))->isNullValue()) {
+      Ld = Ld->getOperand(0);
+      if (Ld->getOpcode() == ISD::SCALAR_TO_VECTOR &&
+          MayFoldLoad(Ld->getOperand(0)))
+        Ld = Ld->getOperand(0);
+      else
+        return SDValue();
+    } else
+      return SDValue();
+  } else
+    return SDValue();
+
+  assert(EVT == MVT::f32);
+  // Create this as a scalar to vector to match the instruction pattern.
+  SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
+  // countS bits are ignored when loading from memory on insertps, which
+  // means we don't need to explicitly set them to 0.
+  return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
+                     LoadScalarToVector, N->getOperand(2));
+}
+
 // Helper function of PerformSETCCCombine. It is to materialize "setb reg"
 // as "sbb reg,reg", since it can be extended without zext and produces
 // an all-ones bit which is more useful than 0/1 in some cases.
@@ -20396,6 +20451,8 @@
   case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
   case ISD::INTRINSIC_WO_CHAIN:
     return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
+  case X86ISD::INSERTPS:
+    return PerformINSERTPSCombine(N, DAG, Subtarget);
   }
 
   return SDValue();
Index: test/CodeGen/X86/fold-load-vec.ll
===================================================================
--- test/CodeGen/X86/fold-load-vec.ll
+++ test/CodeGen/X86/fold-load-vec.ll
@@ -5,7 +5,7 @@
 ; loads from m32.
 define void @sample_test(<4 x float>* %source, <2 x float>* %dest) nounwind {
 ; CHECK: sample_test
-; CHECK: movaps
+; CHECK-NOT: movaps
 ; CHECK: insertps
 entry:
   %source.addr = alloca <4 x float>*, align 8
Index: test/CodeGen/X86/sse41.ll
===================================================================
--- test/CodeGen/X86/sse41.ll
+++ test/CodeGen/X86/sse41.ll
@@ -576,3 +576,33 @@
   %res = select <4 x i1> %mask, <4 x float> %x, <4 x float>%vecinit5
   ret <4 x float> %res
 }
+
+define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; CHECK-LABEL: insertps_from_vector_load:
+; On X32, account for the argument's move to registers
+; X32: movl 4(%esp), %eax
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+  %1 = load <4 x float>* %pb, align 16
+  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
+  ret <4 x float> %2
+}
+
+define <4 x float> @insertps_from_broadcast(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
+; CHECK-LABEL: insertps_from_broadcast:
+; On X32, account for the arguments' move to registers
+; X32: movl 8(%esp), %eax
+; X32: movl 4(%esp), %ecx
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+  %1 = getelementptr inbounds float* %fb, i64 %index
+  %2 = load float* %1, align 4
+  %3 = insertelement <4 x float> undef, float %2, i32 0
+  %4 = insertelement <4 x float> %3, float %2, i32 1
+  %5 = insertelement <4 x float> %4, float %2, i32 2
+  %6 = insertelement <4 x float> %5, float %2, i32 3
+  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+  ret <4 x float> %7
+}