Index: llvm/include/llvm/CodeGen/TargetLowering.h =================================================================== --- llvm/include/llvm/CodeGen/TargetLowering.h +++ llvm/include/llvm/CodeGen/TargetLowering.h @@ -2441,6 +2441,14 @@ return false; } + /// Return true if extraction of a scalar element from the given vector type + /// at the given index is cheap. For example, if scalar operations occur on + /// the same register file as vector operations, then an extract element may + /// be a sub-register rename rather than an actual instruction. + virtual bool isExtractVecEltCheap(EVT VT, unsigned Index) const { + return false; + } + /// Try to convert math with an overflow comparison into the corresponding DAG /// node operation. Targets may want to override this independently of whether /// the operation is legal/custom for the given type because it may obscure Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -18125,6 +18125,22 @@ NewBV = DAG.getBitcast(VT, NewBV); return NewBV; } + int SplatIndex = SVN->getSplatIndex(); + if (TLI.isExtractVecEltCheap(VT, SplatIndex) && + ISD::isBinaryOp(N0.getNode())) { + // splat (vector_bo L, R), Index --> + // splat (scalar_bo (extelt L, Index), (extelt R, Index)) + SDValue L = N0.getOperand(0), R = N0.getOperand(1); + SDLoc DL(N); + EVT EltVT = VT.getScalarType(); + SDValue Index = DAG.getIntPtrConstant(SplatIndex, DL); + SDValue ExtL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, L, Index); + SDValue ExtR = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, R, Index); + SDValue NewBO = DAG.getNode(N0.getOpcode(), DL, EltVT, ExtL, ExtR); + SDValue Insert = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, NewBO); + SmallVector<int, 16> ZeroMask(VT.getVectorNumElements(), 0); + return DAG.getVectorShuffle(VT, DL, Insert, DAG.getUNDEF(VT), ZeroMask); + } } // Simplify 
source operands based on shuffle mask. Index: llvm/lib/Target/X86/X86ISelLowering.h =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.h +++ llvm/lib/Target/X86/X86ISelLowering.h @@ -1074,6 +1074,12 @@ /// supported. bool shouldScalarizeBinop(SDValue) const override; + /// Extract of a scalar FP value from index 0 of a vector is free. + bool isExtractVecEltCheap(EVT VT, unsigned Index) const override { + EVT EltVT = VT.getScalarType(); + return (EltVT == MVT::f32 || EltVT == MVT::f64) && Index == 0; + } + /// Overflow nodes should get combined/lowered to optimal instructions /// (they should allow eliminating explicit compares by getting flags from /// math ops). Index: llvm/test/CodeGen/X86/haddsub-shuf.ll =================================================================== --- llvm/test/CodeGen/X86/haddsub-shuf.ll +++ llvm/test/CodeGen/X86/haddsub-shuf.ll @@ -274,7 +274,7 @@ ; SSSE3_SLOW: # %bb.0: ; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm1 ; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSSE3_SLOW-NEXT: addpd %xmm0, %xmm1 +; SSSE3_SLOW-NEXT: addsd %xmm0, %xmm1 ; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] ; SSSE3_SLOW-NEXT: retq ; @@ -286,7 +286,7 @@ ; AVX1_SLOW-LABEL: hadd_v2f64: ; AVX1_SLOW: # %bb.0: ; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX1_SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; AVX1_SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] ; AVX1_SLOW-NEXT: retq ; @@ -298,7 +298,7 @@ ; AVX2_SLOW-LABEL: hadd_v2f64: ; AVX2_SLOW: # %bb.0: ; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX2_SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; AVX2_SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX2_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] ; AVX2_SLOW-NEXT: retq ; @@ -398,12 +398,12 @@ ; SSSE3_SLOW: # %bb.0: ; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm2 ; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; SSSE3_SLOW-NEXT: movapd %xmm1, 
%xmm3 -; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] -; SSSE3_SLOW-NEXT: addpd %xmm1, %xmm3 -; SSSE3_SLOW-NEXT: addpd %xmm0, %xmm2 +; SSSE3_SLOW-NEXT: addsd %xmm0, %xmm2 ; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0] -; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm3[0,0] +; SSSE3_SLOW-NEXT: movapd %xmm1, %xmm2 +; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSSE3_SLOW-NEXT: addsd %xmm1, %xmm2 +; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm2[0,0] ; SSSE3_SLOW-NEXT: retq ; ; SSSE3_FAST-LABEL: hadd_v4f64: @@ -447,7 +447,7 @@ ; SSSE3_SLOW: # %bb.0: ; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm1 ; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSSE3_SLOW-NEXT: subpd %xmm1, %xmm0 +; SSSE3_SLOW-NEXT: subsd %xmm1, %xmm0 ; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] ; SSSE3_SLOW-NEXT: retq ; @@ -459,7 +459,7 @@ ; AVX1_SLOW-LABEL: hsub_v2f64: ; AVX1_SLOW: # %bb.0: ; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX1_SLOW-NEXT: vsubpd %xmm1, %xmm0, %xmm0 +; AVX1_SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0 ; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] ; AVX1_SLOW-NEXT: retq ; @@ -471,7 +471,7 @@ ; AVX2_SLOW-LABEL: hsub_v2f64: ; AVX2_SLOW: # %bb.0: ; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX2_SLOW-NEXT: vsubpd %xmm1, %xmm0, %xmm0 +; AVX2_SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0 ; AVX2_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] ; AVX2_SLOW-NEXT: retq ; @@ -491,11 +491,11 @@ ; SSSE3_SLOW: # %bb.0: ; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm2 ; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; SSSE3_SLOW-NEXT: movapd %xmm1, %xmm3 -; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] -; SSSE3_SLOW-NEXT: subpd %xmm3, %xmm1 -; SSSE3_SLOW-NEXT: subpd %xmm2, %xmm0 +; SSSE3_SLOW-NEXT: subsd %xmm2, %xmm0 ; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] +; SSSE3_SLOW-NEXT: movapd %xmm1, %xmm2 +; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSSE3_SLOW-NEXT: subsd %xmm2, %xmm1 ; 
SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm1[0,0] ; SSSE3_SLOW-NEXT: retq ; Index: llvm/test/CodeGen/X86/scalarize-fp.ll =================================================================== --- llvm/test/CodeGen/X86/scalarize-fp.ll +++ llvm/test/CodeGen/X86/scalarize-fp.ll @@ -379,13 +379,13 @@ define <2 x double> @fadd_splat_splat_v2f64(<2 x double> %vx, <2 x double> %vy) { ; SSE-LABEL: fadd_splat_splat_v2f64: ; SSE: # %bb.0: -; SSE-NEXT: addpd %xmm1, %xmm0 +; SSE-NEXT: addsd %xmm1, %xmm0 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0] ; SSE-NEXT: retq ; ; AVX-LABEL: fadd_splat_splat_v2f64: ; AVX: # %bb.0: -; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] ; AVX-NEXT: retq %splatx = shufflevector <2 x double> %vx, <2 x double> undef, <2 x i32> zeroinitializer @@ -397,14 +397,14 @@ define <4 x double> @fsub_splat_splat_v4f64(double %x, double %y) { ; SSE-LABEL: fsub_splat_splat_v4f64: ; SSE: # %bb.0: -; SSE-NEXT: subpd %xmm1, %xmm0 +; SSE-NEXT: subsd %xmm1, %xmm0 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0] ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: retq ; ; AVX-LABEL: fsub_splat_splat_v4f64: ; AVX: # %bb.0: -; AVX-NEXT: vsubpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: retq @@ -419,13 +419,13 @@ define <4 x float> @fmul_splat_splat_v4f32(<4 x float> %vx, <4 x float> %vy) { ; SSE-LABEL: fmul_splat_splat_v4f32: ; SSE: # %bb.0: -; SSE-NEXT: mulps %xmm1, %xmm0 +; SSE-NEXT: mulss %xmm1, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: retq ; ; AVX-LABEL: fmul_splat_splat_v4f32: ; AVX: # %bb.0: -; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: retq %splatx = shufflevector <4 x float> %vx, <4 x float> undef, <4 x i32> zeroinitializer @@ -437,26 +437,14 @@ define 
<8 x float> @fdiv_splat_splat_v8f32(<8 x float> %vx, <8 x float> %vy) { ; SSE-LABEL: fdiv_splat_splat_v8f32: ; SSE: # %bb.0: -; SSE-NEXT: rcpps %xmm2, %xmm3 -; SSE-NEXT: mulps %xmm3, %xmm2 -; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SSE-NEXT: subps %xmm2, %xmm1 -; SSE-NEXT: mulps %xmm3, %xmm1 -; SSE-NEXT: addps %xmm3, %xmm1 -; SSE-NEXT: mulps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: divss %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: retq ; ; AVX-LABEL: fdiv_splat_splat_v8f32: ; AVX: # %bb.0: -; AVX-NEXT: vrcpps %ymm1, %ymm2 -; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX-NEXT: vsubps %xmm1, %xmm3, %xmm1 -; AVX-NEXT: vmulps %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vaddps %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: retq @@ -569,7 +557,7 @@ ; SSE-LABEL: fsub_const_op0_splat_v4f64: ; SSE: # %bb.0: ; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: subpd %xmm0, %xmm1 +; SSE-NEXT: subsd %xmm0, %xmm1 ; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0,0] ; SSE-NEXT: movapd %xmm1, %xmm0 ; SSE-NEXT: retq @@ -577,7 +565,7 @@ ; AVX-LABEL: fsub_const_op0_splat_v4f64: ; AVX: # %bb.0: ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vsubpd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: retq @@ -667,13 +655,13 @@ define <2 x double> @splat0_fadd_v2f64(<2 x double> %vx, <2 x double> %vy) { ; SSE-LABEL: splat0_fadd_v2f64: ; SSE: # %bb.0: -; SSE-NEXT: addpd %xmm1, %xmm0 +; SSE-NEXT: addsd %xmm1, %xmm0 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0] ; 
SSE-NEXT: retq ; ; AVX-LABEL: splat0_fadd_v2f64: ; AVX: # %bb.0: -; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] ; AVX-NEXT: retq %b = fadd <2 x double> %vx, %vy @@ -684,14 +672,14 @@ define <4 x double> @splat0_fsub_v4f64(double %x, double %y) { ; SSE-LABEL: splat0_fsub_v4f64: ; SSE: # %bb.0: -; SSE-NEXT: subpd %xmm1, %xmm0 +; SSE-NEXT: subsd %xmm1, %xmm0 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0] ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: retq ; ; AVX-LABEL: splat0_fsub_v4f64: ; AVX: # %bb.0: -; AVX-NEXT: vsubpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: retq @@ -705,13 +693,13 @@ define <4 x float> @splat0_fmul_v4f32(<4 x float> %vx, <4 x float> %vy) { ; SSE-LABEL: splat0_fmul_v4f32: ; SSE: # %bb.0: -; SSE-NEXT: mulps %xmm1, %xmm0 +; SSE-NEXT: mulss %xmm1, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: retq ; ; AVX-LABEL: splat0_fmul_v4f32: ; AVX: # %bb.0: -; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: retq %b = fmul fast <4 x float> %vx, %vy @@ -722,26 +710,14 @@ define <8 x float> @splat0_fdiv_v8f32(<8 x float> %vx, <8 x float> %vy) { ; SSE-LABEL: splat0_fdiv_v8f32: ; SSE: # %bb.0: -; SSE-NEXT: rcpps %xmm2, %xmm3 -; SSE-NEXT: mulps %xmm3, %xmm2 -; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SSE-NEXT: subps %xmm2, %xmm1 -; SSE-NEXT: mulps %xmm3, %xmm1 -; SSE-NEXT: addps %xmm3, %xmm1 -; SSE-NEXT: mulps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: divss %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: retq ; ; AVX-LABEL: splat0_fdiv_v8f32: ; AVX: # %bb.0: -; AVX-NEXT: vrcpps %ymm1, %ymm2 -; AVX-NEXT: 
vmulps %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX-NEXT: vsubps %xmm1, %xmm3, %xmm1 -; AVX-NEXT: vmulps %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vaddps %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: retq @@ -753,16 +729,13 @@ define <2 x double> @splat0_fadd_const_op1_v2f64(<2 x double> %vx) { ; SSE-LABEL: splat0_fadd_const_op1_v2f64: ; SSE: # %bb.0: -; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: addpd %xmm0, %xmm1 -; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0,0] -; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: addsd {{.*}}(%rip), %xmm0 +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0] ; SSE-NEXT: retq ; ; AVX-LABEL: splat0_fadd_const_op1_v2f64: ; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vaddsd {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] ; AVX-NEXT: retq %b = fadd <2 x double> %vx, @@ -774,7 +747,7 @@ ; SSE-LABEL: splat0_fsub_const_op0_v4f64: ; SSE: # %bb.0: ; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: subpd %xmm0, %xmm1 +; SSE-NEXT: subsd %xmm0, %xmm1 ; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0,0] ; SSE-NEXT: movapd %xmm1, %xmm0 ; SSE-NEXT: retq @@ -782,7 +755,7 @@ ; AVX-LABEL: splat0_fsub_const_op0_v4f64: ; AVX: # %bb.0: ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vsubpd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: retq @@ -795,16 +768,13 @@ define <4 x float> @splat0_fmul_const_op1_v4f32(<4 x float> %vx) { ; SSE-LABEL: splat0_fmul_const_op1_v4f32: ; SSE: # %bb.0: -; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE-NEXT: mulps %xmm0, %xmm1 -; SSE-NEXT: shufps 
{{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: mulss {{.*}}(%rip), %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: retq ; ; AVX-LABEL: splat0_fmul_const_op1_v4f32: ; AVX: # %bb.0: -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: retq %b = fmul fast <4 x float> %vx, @@ -821,13 +791,6 @@ ; ; AVX-LABEL: splat0_fdiv_const_op1_v8f32: ; AVX: # %bb.0: -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vrcpps %ymm1, %ymm1 -; AVX-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX-NEXT: vsubps %xmm1, %xmm2, %xmm2 -; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm2 -; AVX-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: retq @@ -839,24 +802,16 @@ define <8 x float> @splat0_fdiv_const_op0_v8f32(<8 x float> %vx) { ; SSE-LABEL: splat0_fdiv_const_op0_v8f32: ; SSE: # %bb.0: -; SSE-NEXT: rcpps %xmm0, %xmm2 -; SSE-NEXT: mulps %xmm2, %xmm0 -; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SSE-NEXT: subps %xmm0, %xmm1 -; SSE-NEXT: mulps %xmm2, %xmm1 -; SSE-NEXT: addps %xmm2, %xmm1 +; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: divss %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: splat0_fdiv_const_op0_v8f32: ; AVX: # %bb.0: -; AVX-NEXT: vrcpps %ymm0, %ymm1 -; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; AVX-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: 
vdivss %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: retq