Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -7412,6 +7412,32 @@
                               getShuffleSHUFImmediate(SVOp), DAG);
 }
 
+/// Replace a vector load with a scalar load of a single element.
+///
+/// Builds a new load of the element type of \p Load's result from the
+/// address of \p Load advanced by \p Index elements. The new load uses the
+/// same input chain as \p Load, and its memory operand is derived from
+/// \p Load's, narrowed to one element at the matching byte offset.
+static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
+                                         SelectionDAG &DAG) {
+  SDLoc dl(Load);
+  MVT VT = Load->getSimpleValueType(0);
+  MVT EVT = VT.getVectorElementType();
+  unsigned Offset = Index * EVT.getStoreSize();
+  SDValue Addr = Load->getOperand(1);
+  SDValue NewAddr =
+      DAG.getNode(ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
+                  DAG.getConstant(Offset, Addr.getSimpleValueType()));
+
+  // The memory operand must describe the bytes actually accessed, so it is
+  // offset by the same amount as the address.
+  SDValue NewLoad =
+      DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
+                  DAG.getMachineFunction().getMachineMemOperand(
+                      Load->getMemOperand(), Offset, EVT.getStoreSize()));
+  return NewLoad;
+}
+
 // It is only safe to call this function if isINSERTPSMask is true for
 // this shufflevector mask.
 static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
@@ -7423,7 +7449,6 @@
   // If we're transferring an i32 from memory to a specific element in a
   // register, we output a generic DAG that will match the PINSRD
   // instruction.
-  // TODO: Optimize for AVX cases too (VINSERTPS)
   MVT VT = SVOp->getSimpleValueType(0);
   MVT EVT = VT.getVectorElementType();
   SDValue V1 = SVOp->getOperand(0);
@@ -7456,17 +7481,10 @@
     // Trivial case, when From comes from a load and is only used by the
     // shuffle. Make it use insertps from the vector that we need from that
     // load.
-    SDValue Addr = From.getOperand(1);
-    SDValue NewAddr =
-        DAG.getNode(ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
-                    DAG.getConstant(DestIndex * EVT.getStoreSize(),
-                                    Addr.getSimpleValueType()));
-
-    LoadSDNode *Load = cast<LoadSDNode>(From);
     SDValue NewLoad =
-        DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
-                    DAG.getMachineFunction().getMachineMemOperand(
-                        Load->getMemOperand(), 0, EVT.getStoreSize()));
+        NarrowVectorLoadToElement(cast<LoadSDNode>(From), DestIndex, DAG);
+    if (!NewLoad.getNode())
+      return SDValue();
 
     if (EVT == MVT::f32) {
       // Create this as a scalar to vector to match the instruction pattern.
@@ -20266,6 +20284,36 @@
   return SDValue();
 }
 
+/// Fold a vector load feeding the source operand of an X86ISD::INSERTPS into
+/// a scalar load of the one element the instruction actually selects.
+///
+/// Returns the replacement INSERTPS node, or an empty SDValue when operand 1
+/// of \p N is not a load accepted by MayFoldLoad.
+static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget *Subtarget) {
+  SDLoc dl(N);
+  SDValue Ld = N->getOperand(1);
+  MVT VT = Ld.getSimpleValueType();
+  assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
+         "X86insertps is only defined for v4x32");
+
+  if (!MayFoldLoad(Ld))
+    return SDValue();
+
+  // Bits [7:6] of the immediate (countS) select the source element, so that
+  // is the element to load. Mask to two bits to stay inside the vector.
+  unsigned SrcIndex =
+      (cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6) & 0x3;
+  Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), SrcIndex, DAG);
+
+  // Create this as a scalar to vector to match the instruction pattern.
+  SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
+  // countS bits are ignored when loading from memory on insertps, which
+  // means we don't need to explicitly set them to 0.
+  return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
+                     LoadScalarToVector, N->getOperand(2));
+}
+
 // Helper function of PerformSETCCCombine. It is to materialize "setb reg"
 // as "sbb reg,reg", since it can be extended without zext and produces
 // an all-ones bit which is more useful than 0/1 in some cases.
@@ -20569,6 +20617,8 @@
   case ISD::FMA:                    return PerformFMACombine(N, DAG, Subtarget);
   case ISD::INTRINSIC_WO_CHAIN:
     return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
+  case X86ISD::INSERTPS:
+    return PerformINSERTPSCombine(N, DAG, Subtarget);
   }
 
   return SDValue();
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -6550,6 +6550,29 @@
   defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>;
 }
 
+let Predicates = [UseSSE41] in {
+  // If we're inserting an element from a load or a null pshuf of a load,
+  // fold the load into the insertps instruction.
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd (v4f32
+                       (scalar_to_vector (loadf32 addr:$src2))), (i8 0)),
+                   imm:$src3)),
+            (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd
+                      (loadv4f32 addr:$src2), (i8 0)), imm:$src3)),
+            (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+}
+
+let Predicates = [UseAVX] in {
+  // If we're inserting an element from a vbroadcast of a load, fold the
+  // load into the X86insertps instruction.
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
+                (X86VBroadcast (loadf32 addr:$src2)), imm:$src3)),
+            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
+                (X86VBroadcast (loadv4f32 addr:$src2)), imm:$src3)),
+            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+}
+
 //===----------------------------------------------------------------------===//
 // SSE4.1 - Round Instructions
 //===----------------------------------------------------------------------===//
Index: test/CodeGen/X86/avx2.ll
===================================================================
--- test/CodeGen/X86/avx2.ll
+++ test/CodeGen/X86/avx2.ll
@@ -1,4 +1,7 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=core-avx2 | FileCheck %s -check-prefix=X32 --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s -check-prefix=X64 --check-prefix=CHECK
+
+declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
 
 define <4 x i32> @blendvb_fallback_v4i32(<4 x i1> %mask, <4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: @blendvb_fallback_v4i32
@@ -23,3 +26,111 @@
   %ret = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y
   ret <8 x float> %ret
 }
+
+define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; CHECK-LABEL: insertps_from_vector_load:
+; On X32, the pointer argument is first moved from the stack into a register.
+; X32: movl 4(%esp), %eax
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+  %1 = load <4 x float>* %pb, align 16
+  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
+  ret <4 x float> %2
+}
+
+;; Use a non-zero CountS for insertps
+define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; CHECK-LABEL: insertps_from_vector_load_offset:
+; On X32, the pointer argument is first moved from the stack into a register.
+; X32: movl 4(%esp), %eax
+; CHECK-NOT: mov
+;; Match the memory operand too, since we need to check the load's offset.
+; CHECK: insertps $96, 4(%{{...}}), %
+; CHECK-NEXT: ret
+  %1 = load <4 x float>* %pb, align 16
+  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
+  ret <4 x float> %2
+}
+
+define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
+; CHECK-LABEL: insertps_from_vector_load_offset_2:
+; On X32, stack arguments are first moved into registers.
+; X32: movl 4(%esp), %eax
+; X32: movl 8(%esp), %ecx
+; CHECK-NOT: mov
+;; Match the memory operand too, since we need to check the load's offset.
+; CHECK: vinsertps $192, 12(%{{...}},%{{...}}), %
+; CHECK-NEXT: ret
+  %1 = getelementptr inbounds <4 x float>* %pb, i64 %index
+  %2 = load <4 x float>* %1, align 16
+  %3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
+  ret <4 x float> %3
+}
+
+define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
+; CHECK-LABEL: insertps_from_broadcast_loadf32:
+; On X32, stack arguments are first moved into registers.
+; X32: movl 8(%esp), %eax
+; X32: movl 4(%esp), %ecx
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+  %1 = getelementptr inbounds float* %fb, i64 %index
+  %2 = load float* %1, align 4
+  %3 = insertelement <4 x float> undef, float %2, i32 0
+  %4 = insertelement <4 x float> %3, float %2, i32 1
+  %5 = insertelement <4 x float> %4, float %2, i32 2
+  %6 = insertelement <4 x float> %5, float %2, i32 3
+  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+  ret <4 x float> %7
+}
+
+define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
+; CHECK-LABEL: insertps_from_broadcast_loadv4f32:
+; On X32, the pointer argument is first moved from the stack into a register.
+; X32: movl 4(%esp), %{{...}}
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+  %1 = load <4 x float>* %b, align 4
+  %2 = extractelement <4 x float> %1, i32 0
+  %3 = insertelement <4 x float> undef, float %2, i32 0
+  %4 = insertelement <4 x float> %3, float %2, i32 1
+  %5 = insertelement <4 x float> %4, float %2, i32 2
+  %6 = insertelement <4 x float> %5, float %2, i32 3
+  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+  ret <4 x float> %7
+}
+
+;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
+define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
+; CHECK-LABEL: insertps_from_broadcast_multiple_use:
+; On X32, stack arguments are first moved into registers.
+; X32: movl 8(%esp), %eax
+; X32: movl 4(%esp), %ecx
+; CHECK: vbroadcastss
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: vaddps
+; CHECK: vaddps
+; CHECK: vaddps
+; CHECK-NEXT: ret
+  %1 = getelementptr inbounds float* %fb, i64 %index
+  %2 = load float* %1, align 4
+  %3 = insertelement <4 x float> undef, float %2, i32 0
+  %4 = insertelement <4 x float> %3, float %2, i32 1
+  %5 = insertelement <4 x float> %4, float %2, i32 2
+  %6 = insertelement <4 x float> %5, float %2, i32 3
+  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+  %8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48)
+  %9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48)
+  %10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48)
+  %11 = fadd <4 x float> %7, %8
+  %12 = fadd <4 x float> %9, %10
+  %13 = fadd <4 x float> %11, %12
+  ret <4 x float> %13
+}
Index: test/CodeGen/X86/fold-load-vec.ll
===================================================================
--- test/CodeGen/X86/fold-load-vec.ll
+++ test/CodeGen/X86/fold-load-vec.ll
@@ -5,7 +5,7 @@
 ; loads from m32.
 define void @sample_test(<4 x float>* %source, <2 x float>* %dest) nounwind {
 ; CHECK: sample_test
-; CHECK: movaps
+; CHECK-NOT: movaps
 ; CHECK: insertps
 entry:
   %source.addr = alloca <4 x float>*, align 8
Index: test/CodeGen/X86/sse41.ll
===================================================================
--- test/CodeGen/X86/sse41.ll
+++ test/CodeGen/X86/sse41.ll
@@ -584,3 +584,111 @@
   %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y
   ret <8 x i16> %ret
 }
+
+define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; CHECK-LABEL: insertps_from_vector_load:
+; On X32, the pointer argument is first moved from the stack into a register.
+; X32: movl 4(%esp), %eax
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+  %1 = load <4 x float>* %pb, align 16
+  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
+  ret <4 x float> %2
+}
+
+;; Use a non-zero CountS for insertps
+define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; CHECK-LABEL: insertps_from_vector_load_offset:
+; On X32, the pointer argument is first moved from the stack into a register.
+; X32: movl 4(%esp), %eax
+; CHECK-NOT: mov
+;; Match the memory operand too, since we need to check the load's offset.
+; CHECK: insertps $96, 4(%{{...}}), %
+; CHECK-NEXT: ret
+  %1 = load <4 x float>* %pb, align 16
+  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
+  ret <4 x float> %2
+}
+
+define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
+; CHECK-LABEL: insertps_from_vector_load_offset_2:
+; On X32, stack arguments are first moved into registers.
+; X32: movl 4(%esp), %eax
+; X32: movl 8(%esp), %ecx
+; CHECK-NOT: mov
+;; Match the memory operand too, since we need to check the load's offset.
+; CHECK: insertps $192, 12(%{{...}},%{{...}}), %
+; CHECK-NEXT: ret
+  %1 = getelementptr inbounds <4 x float>* %pb, i64 %index
+  %2 = load <4 x float>* %1, align 16
+  %3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
+  ret <4 x float> %3
+}
+
+define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
+; CHECK-LABEL: insertps_from_broadcast_loadf32:
+; On X32, stack arguments are first moved into registers.
+; X32: movl 8(%esp), %eax
+; X32: movl 4(%esp), %ecx
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+  %1 = getelementptr inbounds float* %fb, i64 %index
+  %2 = load float* %1, align 4
+  %3 = insertelement <4 x float> undef, float %2, i32 0
+  %4 = insertelement <4 x float> %3, float %2, i32 1
+  %5 = insertelement <4 x float> %4, float %2, i32 2
+  %6 = insertelement <4 x float> %5, float %2, i32 3
+  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+  ret <4 x float> %7
+}
+
+define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
+; CHECK-LABEL: insertps_from_broadcast_loadv4f32:
+; On X32, the pointer argument is first moved from the stack into a register.
+; X32: movl 4(%esp), %{{...}}
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+  %1 = load <4 x float>* %b, align 4
+  %2 = extractelement <4 x float> %1, i32 0
+  %3 = insertelement <4 x float> undef, float %2, i32 0
+  %4 = insertelement <4 x float> %3, float %2, i32 1
+  %5 = insertelement <4 x float> %4, float %2, i32 2
+  %6 = insertelement <4 x float> %5, float %2, i32 3
+  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+  ret <4 x float> %7
+}
+
+;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
+define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
+; CHECK-LABEL: insertps_from_broadcast_multiple_use:
+; On X32, stack arguments are first moved into registers.
+; X32: movl 8(%esp), %eax
+; X32: movl 4(%esp), %ecx
+; CHECK: movss
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: addps
+; CHECK: addps
+; CHECK: addps
+; CHECK-NEXT: ret
+  %1 = getelementptr inbounds float* %fb, i64 %index
+  %2 = load float* %1, align 4
+  %3 = insertelement <4 x float> undef, float %2, i32 0
+  %4 = insertelement <4 x float> %3, float %2, i32 1
+  %5 = insertelement <4 x float> %4, float %2, i32 2
+  %6 = insertelement <4 x float> %5, float %2, i32 3
+  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+  %8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48)
+  %9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48)
+  %10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48)
+  %11 = fadd <4 x float> %7, %8
+  %12 = fadd <4 x float> %9, %10
+  %13 = fadd <4 x float> %11, %12
+  ret <4 x float> %13
+}