Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -10438,6 +10438,9 @@
   }
 
   if (EltVT == MVT::f32) {
+    // FIXME: We should generate a BLENDI here if we're not crossing lanes.
+    // BLENDPS has better performance than INSERTPS.
+
     // Bits [7:6] of the constant are the source select. This will always be
     // zero here. The DAG Combiner may combine an extract_elt index into
     // these
@@ -22897,17 +22900,19 @@
   MVT VT = N->getOperand(1)->getSimpleValueType(0);
   assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
          "X86insertps is only defined for v4x32");
-
-  SDValue Ld = N->getOperand(1);
-  if (MayFoldLoad(Ld)) {
+
+  auto Imm8 = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  if (MayFoldLoad(N1)) {
     // Extract the countS bits from the immediate so we can get the proper
     // address when narrowing the vector load to a specific element.
     // When the second source op is a memory address, insertps doesn't use
     // countS and just gets an f32 from that address.
-    unsigned DestIndex =
-        cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
+    auto DestIndex = Imm8 >> 6;
 
-    Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
+    auto Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(N1), DestIndex, DAG);
 
     // Create this as a scalar to vector to match the instruction pattern.
     SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
@@ -22916,7 +22921,41 @@
     return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
                        LoadScalarToVector, N->getOperand(2));
   }
-  return SDValue();
+
+  // A register/register insertps that is just moving 32 bits to the
+  // corresponding location in the destination register can be simplified into
+  // a blendps. Both instructions are vector permutes, but blendps may have
+  // superior performance because it is a simpler operation.
+  // This also allows us to eliminate some pattern-matching possibilities for
+  // scalar SSE math ops that are performed in xmm registers and then shuffled.
+
+  // FIXME: If optimizing for size and there is a load folding opportunity,
+  // we should either not do this transform or we should undo it in
+  // PerformBLENDICombine. The above check for "MayFoldLoad" doesn't work
+  // because it doesn't look through a SCALAR_TO_VECTOR node.
+
+  switch (Imm8) {
+  default: return SDValue();
+
+  // The insertps immediate for the register/register variant is:
+  //   Bits [7:6] - select exactly one of four 32-bit source lanes
+  //   Bits [5:4] - select exactly one of four 32-bit destination lanes
+  //   Bits [3:0] - zero mask bonus operation
+
+  // The blendps immediate is:
+  //   Bits [3:0] - if a bit is set, copy the 32-bit source lane to the
+  //                corresponding destination lane.
+
+  // To do this transform, the source select bits [7:6] must match the
+  // destination select bits [5:4], and the zero mask bits [3:0] must be off.
+
+  case 0x00: Imm8 = 0x01; break; // copy src bits [31:0] to dest
+  case 0x50: Imm8 = 0x02; break; // copy src bits [63:32] to dest
+  case 0xA0: Imm8 = 0x04; break; // copy src bits [95:64] to dest
+  case 0xF0: Imm8 = 0x08; break; // copy src bits [127:96] to dest
+  }
+  SDValue NewMask = DAG.getConstant(Imm8, MVT::i8);
+  return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, NewMask);
 }
 
 static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) {
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -3179,13 +3179,6 @@
 
   // With SSE 4.1, insertps/blendi are preferred to movsd, so match those too.
   let Predicates = [UseSSE41] in {
-    // extracted scalar math op with insert via insertps
-    def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
-          (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
-          FR32:$src))), (iPTR 0))),
-      (!cast<Instruction>(OpcPrefix#SSrr_Int) v4f32:$dst,
-          (COPY_TO_REGCLASS FR32:$src, VR128))>;
-
     // extracted scalar math op with insert via blend
     def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
           (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
@@ -3202,14 +3195,7 @@
 
   // Repeat everything for AVX, except for the movss + scalar combo...
   // because that one shouldn't occur with AVX codegen?
-  let Predicates = [HasAVX] in {
-    // extracted scalar math op with insert via insertps
-    def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
-          (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
-          FR32:$src))), (iPTR 0))),
-      (!cast<Instruction>("V"#OpcPrefix#SSrr_Int) v4f32:$dst,
-          (COPY_TO_REGCLASS FR32:$src, VR128))>;
-
+  let Predicates = [HasAVX] in {
     // extracted scalar math op with insert via blend
     def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
           (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
Index: test/CodeGen/X86/avx-load-store.ll
===================================================================
--- test/CodeGen/X86/avx-load-store.ll
+++ test/CodeGen/X86/avx-load-store.ll
@@ -23,20 +23,25 @@
 
 declare void @dummy(<4 x double>, <8 x float>, <4 x i64>)
 
-;;
-;; The two tests below check that we must fold load + scalar_to_vector
-;; + ins_subvec+ zext into only a single vmovss or vmovsd or vinsertps from memory
+
+; Although this could have a load folded vinsertps, we prefer
+; to use vblendps because it has better performance.
+; FIXME: If optimizing for size, we should generate a vinsertps.
 
 ; CHECK: mov00
 define <8 x float> @mov00(<8 x float> %v, float * %ptr) nounwind {
   %val = load float* %ptr
-; CHECK: vinsertps
+; CHECK: vblendps
 ; CHECK: vinsertf128
   %i0 = insertelement <8 x float> zeroinitializer, float %val, i32 0
   ret <8 x float> %i0
 ; CHECK: ret
 }
 
+;;
+;; This test checks that we must fold load
+;; + ins_subvec + zext into only a single vmovlpd from memory
+
 ; CHECK: mov01
 define <4 x double> @mov01(<4 x double> %v, double * %ptr) nounwind {
   %val = load double* %ptr
Index: test/CodeGen/X86/sse41.ll
===================================================================
--- test/CodeGen/X86/sse41.ll
+++ test/CodeGen/X86/sse41.ll
@@ -199,28 +199,36 @@
 
 declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
 
-define <4 x float> @insertps_2(<4 x float> %t1, float %t2) nounwind {
-; X32-LABEL: insertps_2:
+; In cases where either blendps or insertps will do the job,
+; prefer blendps because it has better performance.
+
+; FIXME: If optimizing for size and there is a load folding opportunity,
+; we should generate a vinsertps.
+
+define <4 x float> @blendps_1(<4 x float> %t1, float %t2) nounwind {
+; X32-LABEL: blendps_1:
 ; X32: ## BB#0:
-; X32-NEXT: insertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
+; X32-NEXT: movss 4(%esp), {{.*}} mem[0],zero,zero,zero
+; X32-NEXT: blendps {{.*#+}} xmm0 = xmm{{.*}}[0],xmm0[1,2,3]
 ; X32-NEXT: retl
 ;
-; X64-LABEL: insertps_2:
+; X64-LABEL: blendps_1:
 ; X64: ## BB#0:
-; X64-NEXT: insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; X64-NEXT: retq
   %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
   ret <4 x float> %tmp1
 }
-define <4 x float> @insertps_3(<4 x float> %t1, <4 x float> %t2) nounwind {
-; X32-LABEL: insertps_3:
+
+define <4 x float> @blendps_2(<4 x float> %t1, <4 x float> %t2) nounwind {
+; X32-LABEL: blendps_2:
 ; X32: ## BB#0:
-; X32-NEXT: insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; X32-NEXT: retl
 ;
-; X64-LABEL: insertps_3:
+; X64-LABEL: blendps_2:
 ; X64: ## BB#0:
-; X64-NEXT: insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; X64-NEXT: retq
   %tmp2 = extractelement <4 x float> %t2, i32 0
   %tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0
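
Aside, not part of the patch: the immediate translation performed by the new switch in PerformINSERTPSCombine can be sketched as a small standalone helper. This is only an illustration of the encoding rules described in the comments above; the helper name insertpsImmToBlendpsImm and its use of std::optional are hypothetical and do not appear in LLVM.

#include <cstdint>
#include <optional>

// Hypothetical sketch: map a register/register INSERTPS immediate to an
// equivalent BLENDPS immediate, when such a mapping exists.
std::optional<uint8_t> insertpsImmToBlendpsImm(uint8_t Imm8) {
  unsigned SrcLane  = (Imm8 >> 6) & 0x3; // bits [7:6]: source lane select
  unsigned DestLane = (Imm8 >> 4) & 0x3; // bits [5:4]: destination lane select
  unsigned ZeroMask = Imm8 & 0xF;        // bits [3:0]: lanes forced to zero

  // The transform only applies when no lane is zeroed and the value stays in
  // the same lane position, i.e. the immediates 0x00, 0x50, 0xA0, and 0xF0
  // handled by the switch in the patch.
  if (ZeroMask != 0 || SrcLane != DestLane)
    return std::nullopt;

  // BLENDPS copies source lane i to destination lane i for every set bit i in
  // its immediate, so a single same-lane copy becomes a one-hot mask.
  return uint8_t(1u << DestLane);
}

For example, insertpsImmToBlendpsImm(0xA0) yields 0x04, matching the case 0xA0 arm of the switch, while any immediate with a nonzero zero mask or mismatched source/destination lanes is rejected, mirroring the default: return SDValue(); path.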