Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -10473,6 +10473,10 @@
   }
 
   if (EltVT == MVT::f32) {
+    // FIXME: We should generate a BLENDI here if we're just inserting from
+    // the low lane to the low lane and not zeroing (IdxVal == 0).
+    // BLENDPS has better performance than INSERTPS in that case.
+
     // Bits [7:6] of the constant are the source select. This will always be
     // zero here. The DAG Combiner may combine an extract_elt index into
     // these
@@ -22947,17 +22951,19 @@
   MVT VT = N->getOperand(1)->getSimpleValueType(0);
   assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
          "X86insertps is only defined for v4x32");
-
-  SDValue Ld = N->getOperand(1);
-  if (MayFoldLoad(Ld)) {
+
+  uint64_t Imm8 = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  if (MayFoldLoad(N1)) {
     // Extract the countS bits from the immediate so we can get the proper
     // address when narrowing the vector load to a specific element.
     // When the second source op is a memory address, insertps doesn't use
     // countS and just gets an f32 from that address.
-    unsigned DestIndex =
-        cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
+    unsigned DestIdx = Imm8 >> 6;
 
-    Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
+    SDValue Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(N1), DestIdx, DAG);
 
     // Create this as a scalar to vector to match the instruction pattern.
     SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
@@ -22966,6 +22972,27 @@
     return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
                        LoadScalarToVector, N->getOperand(2));
   }
+
+  // A register/register insertps that is just moving the low 32-bits to the
+  // corresponding location in the destination register can be simplified into
+  // a blendps. Blendps should have equal or better performance because it's a
+  // simpler operation.
+  // This also allows us to eliminate some pattern-matching possibilities for
+  // scalar SSE math ops that are performed in xmm registers and then shuffled.
+
+  // FIXME: If optimizing for size and there is a load folding opportunity,
+  // we should either not do this transform or we should undo it in
+  // PerformBLENDICombine. The above check for "MayFoldLoad" doesn't work
+  // because it doesn't look through a SCALAR_TO_VECTOR node.
+
+  if (Imm8 == 0x00) {
+    // We only convert insertps nodes that operate on the low element of the
+    // vector; converting anything else might cause an extra shuffle
+    // operation to be created.
+    SDValue NewMask = DAG.getConstant(0x01, MVT::i8);
+    return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, NewMask);
+  }
+
   return SDValue();
 }
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -3179,13 +3179,6 @@
   // With SSE 4.1, insertps/blendi are preferred to movsd, so match those too.
   let Predicates = [UseSSE41] in {
-  // extracted scalar math op with insert via insertps
-  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
-        (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
-        FR32:$src))), (iPTR 0))),
-    (!cast<Instruction>(OpcPrefix#SSrr_Int) v4f32:$dst,
-        (COPY_TO_REGCLASS FR32:$src, VR128))>;
-
   // extracted scalar math op with insert via blend
   def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
         (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
@@ -3203,13 +3196,6 @@
   // Repeat everything for AVX, except for the movss + scalar combo...
   // because that one shouldn't occur with AVX codegen?
   let Predicates = [HasAVX] in {
-  // extracted scalar math op with insert via insertps
-  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
-        (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
-        FR32:$src))), (iPTR 0))),
-    (!cast<Instruction>("V"#OpcPrefix#SSrr_Int) v4f32:$dst,
-        (COPY_TO_REGCLASS FR32:$src, VR128))>;
-
   // extracted scalar math op with insert via blend
   def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
         (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
Index: test/CodeGen/X86/avx-load-store.ll
===================================================================
--- test/CodeGen/X86/avx-load-store.ll
+++ test/CodeGen/X86/avx-load-store.ll
@@ -23,20 +23,36 @@
 declare void @dummy(<4 x double>, <8 x float>, <4 x i64>)
 
-;;
-;; The two tests below check that we must fold load + scalar_to_vector
-;; + ins_subvec+ zext into only a single vmovss or vmovsd or vinsertps from memory
+; Although this could have a load-folded vinsertps, we prefer
+; to use vmovss + vblendps because it has better performance.
 
-; CHECK: mov00
-define <8 x float> @mov00(<8 x float> %v, float * %ptr) nounwind {
+; CHECK-LABEL: mov_blendps:
+define <8 x float> @mov_blendps(<8 x float> %v, float * %ptr) nounwind {
   %val = load float, float* %ptr
-; CHECK: vinsertps
+; CHECK: vmovss
+; CHECK: vblendps
 ; CHECK: vinsertf128
   %i0 = insertelement <8 x float> zeroinitializer, float %val, i32 0
   ret <8 x float> %i0
 ; CHECK: ret
 }
 
+; Use vinsertps to load a scalar into a higher lane because there is no
+; version of vblendps that loads a scalar. Transferring out of the low
+; lane after a vmovss would require another shuffle operation.
+
+; CHECK-LABEL: mov_insertps:
+define <4 x float> @mov_insertps(<4 x float> %v, float * %ptr) nounwind {
+  %val = load float, float* %ptr
+; CHECK: vinsertps $29, (%rdi), %xmm0, %xmm0
+  %i0 = insertelement <4 x float> zeroinitializer, float %val, i32 1
+  ret <4 x float> %i0
+; CHECK: ret
+}
+
+;; This test checks that we must fold load + ins_subvec + zext
+;; into only a single vmovlpd from memory
+
 ; CHECK: mov01
 define <4 x double> @mov01(<4 x double> %v, double * %ptr) nounwind {
   %val = load double, double* %ptr
Index: test/CodeGen/X86/sse41.ll
===================================================================
--- test/CodeGen/X86/sse41.ll
+++ test/CodeGen/X86/sse41.ll
@@ -199,28 +199,33 @@
 declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
 
-define <4 x float> @insertps_2(<4 x float> %t1, float %t2) nounwind {
-; X32-LABEL: insertps_2:
+; In cases where either blendps or insertps will do the job,
+; prefer blendps because it has better performance.
+
+define <4 x float> @blendps_1(<4 x float> %t1, float %t2) nounwind {
+; X32-LABEL: blendps_1:
 ; X32:       ## BB#0:
-; X32-NEXT:    insertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
+; X32-NEXT:    movss 4(%esp), {{.*}} mem[0],zero,zero,zero
+; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm{{.*}}[0],xmm0[1,2,3]
 ; X32-NEXT:    retl
 ;
-; X64-LABEL: insertps_2:
+; X64-LABEL: blendps_1:
 ; X64:       ## BB#0:
-; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; X64-NEXT:    retq
   %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
   ret <4 x float> %tmp1
 }
 
-define <4 x float> @insertps_3(<4 x float> %t1, <4 x float> %t2) nounwind {
-; X32-LABEL: insertps_3:
+
+define <4 x float> @blendps_2(<4 x float> %t1, <4 x float> %t2) nounwind {
+; X32-LABEL: blendps_2:
 ; X32:       ## BB#0:
-; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; X32-NEXT:    retl
 ;
-; X64-LABEL: insertps_3:
+; X64-LABEL: blendps_2:
 ; X64:       ## BB#0:
-; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; X64-NEXT:    retq
   %tmp2 = extractelement <4 x float> %t2, i32 0
   %tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0
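
As a standalone illustration of the new combine (not part of the patch; the function name below is made up), a register/register low-element insert written in IR like this should now select a blendps with mask 0x01 instead of an insertps with immediate 0x00, matching the updated blendps_2 checks in sse41.ll above:

; Copy the low element of %b into the low element of %a.
define <4 x float> @low_elt_copy(<4 x float> %a, <4 x float> %b) nounwind {
  %s = extractelement <4 x float> %b, i32 0
  %r = insertelement <4 x float> %a, float %s, i32 0
  ret <4 x float> %r
}

; Expected with this patch (x86-64, SSE4.1):  blendps $1, %xmm1, %xmm0
; Without the patch:                          insertps $0, %xmm1, %xmm0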