Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -10550,16 +10550,29 @@ } if (EltVT == MVT::f32) { - // Bits [7:6] of the constant are the source select. This will always be - // zero here. The DAG Combiner may combine an extract_elt index into - // these - // bits. For example (insert (extract, 3), 2) could be matched by - // putting - // the '3' into bits [7:6] of X86ISD::INSERTPS. - // Bits [5:4] of the constant are the destination select. This is the - // value of the incoming immediate. - // Bits [3:0] of the constant are the zero mask. The DAG Combiner may + // Bits [7:6] of the constant are the source select. This will always be + // zero here. The DAG Combiner may combine an extract_elt index into + // these bits. For example (insert (extract, 3), 2) could be matched by + // putting the '3' into bits [7:6] of X86ISD::INSERTPS. + // Bits [5:4] of the constant are the destination select. This is the + // value of the incoming immediate. + // Bits [3:0] of the constant are the zero mask. The DAG Combiner may // combine either bitwise AND or insert of float 0.0 to set these bits. + + const Function *F = DAG.getMachineFunction().getFunction(); + bool MinSize = F->hasFnAttribute(Attribute::MinSize); + if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) { + // If this is an insertion of 32 bits into the low 32 bits of + // a vector, we prefer to generate a blend with immediate rather + // than an insertps. Blends are simpler operations in hardware and so + // will always have equal or better performance than insertps. + // But if optimizing for size and there's a load folding opportunity, + // generate insertps because blendps does not have a 32-bit memory + // operand form. 
+ N2 = DAG.getIntPtrConstant(1); + N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); + return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2); + } N2 = DAG.getIntPtrConstant(IdxVal << 4); // Create this as a scalar to vector.. N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); Index: test/CodeGen/X86/sse41.ll =================================================================== --- test/CodeGen/X86/sse41.ll +++ test/CodeGen/X86/sse41.ll @@ -199,28 +199,51 @@ declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone -define <4 x float> @insertps_2(<4 x float> %t1, float %t2) nounwind { -; X32-LABEL: insertps_2: +; When optimizing for speed, prefer blendps over insertps even if it means we have to +; generate a separate movss to load the scalar operand. +define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind { +; X32-LABEL: blendps_not_insertps_1: +; X32: ## BB#0: +; X32-NEXT: movss {{.*#+}} xmm1 +; X32-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X32-NEXT: retl +; +; X64-LABEL: blendps_not_insertps_1: +; X64: ## BB#0: +; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X64-NEXT: retq + %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0 + ret <4 x float> %tmp1 +} + +; When optimizing for size, generate an insertps if there's a load folding opportunity. +; The difference between i386 and x86-64 ABIs for the float operand means we should +; generate an insertps for X32 but not for X64! 
+define <4 x float> @insertps_or_blendps(<4 x float> %t1, float %t2) minsize nounwind { +; X32-LABEL: insertps_or_blendps: ; X32: ## BB#0: ; X32-NEXT: insertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3] ; X32-NEXT: retl ; -; X64-LABEL: insertps_2: +; X64-LABEL: insertps_or_blendps: ; X64: ## BB#0: -; X64-NEXT: insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; X64-NEXT: retq %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0 ret <4 x float> %tmp1 } -define <4 x float> @insertps_3(<4 x float> %t1, <4 x float> %t2) nounwind { -; X32-LABEL: insertps_3: + +; An insert into the low 32 bits of a vector from the low 32 bits of another vector +; is always just a blendps because blendps is never more expensive than insertps. +define <4 x float> @blendps_not_insertps_2(<4 x float> %t1, <4 x float> %t2) nounwind { +; X32-LABEL: blendps_not_insertps_2: ; X32: ## BB#0: -; X32-NEXT: insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X32-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; X32-NEXT: retl ; -; X64-LABEL: insertps_3: +; X64-LABEL: blendps_not_insertps_2: ; X64: ## BB#0: -; X64-NEXT: insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; X64-NEXT: retq %tmp2 = extractelement <4 x float> %t2, i32 0 %tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0