Index: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18675,6 +18675,61 @@
   return SDValue();
 }
 
+/// If a vector binop is performed on build vector operands that only have one
+/// non-undef element, it may be profitable to extract, scalarize, and insert.
+static SDValue scalarizeBinOpOfBuildVectors(SDNode *N, SelectionDAG &DAG) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  if (N0.getOpcode() != ISD::BUILD_VECTOR || N0.getOpcode() != N1.getOpcode())
+    return SDValue();
+
+  // Return the index of exactly one scalar element in an otherwise undefined
+  // build vector.
+  auto getScalarIndex = [](SDValue V) {
+    int NotUndefIndex = -1;
+    for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
+      // Ignore undef elements.
+      if (V.getOperand(i).isUndef())
+        continue;
+      // There can be only one.
+      if (NotUndefIndex >= 0)
+        return -1;
+      // This might be the only non-undef operand.
+      NotUndefIndex = i;
+    }
+    return NotUndefIndex;
+  };
+  int N0Index = getScalarIndex(N0);
+  if (N0Index == -1)
+    return SDValue();
+  int N1Index = getScalarIndex(N1);
+  if (N1Index == -1)
+    return SDValue();
+
+  SDValue X = N0.getOperand(N0Index);
+  SDValue Y = N1.getOperand(N1Index);
+  EVT ScalarVT = X.getValueType();
+  if (ScalarVT != Y.getValueType())
+    return SDValue();
+
+  // TODO: Remove/replace the extract cost check? If the elements are available
+  //       as scalars, then there may be no extract cost. Should we ask if
+  //       inserting a scalar back into a vector is cheap instead?
+  EVT VT = N->getValueType(0);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (N0Index != N1Index || !TLI.isExtractVecEltCheap(VT, N0Index) ||
+      !TLI.isOperationLegalOrCustom(N->getOpcode(), ScalarVT))
+    return SDValue();
+
+  // bo (build_vec ...undef, x, undef...), (build_vec ...undef, y, undef...) -->
+  //   build_vec ...undef, (bo x, y), undef...
+  SDValue ScalarBO = DAG.getNode(N->getOpcode(), SDLoc(N), ScalarVT, X, Y,
+                                 N->getFlags());
+  SmallVector<SDValue, 8> Ops(N0.getNumOperands(), DAG.getUNDEF(ScalarVT));
+  Ops[N0Index] = ScalarBO;
+  return DAG.getBuildVector(VT, SDLoc(N), Ops);
+}
+
 /// Visit a binary vector operation, like ADD.
 SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
   assert(N->getValueType(0).isVector() &&
@@ -18737,6 +18792,9 @@
     }
   }
 
+  if (SDValue V = scalarizeBinOpOfBuildVectors(N, DAG))
+    return V;
+
   return SDValue();
 }
 
Index: llvm/trunk/test/CodeGen/X86/scalarize-fp.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/scalarize-fp.ll
+++ llvm/trunk/test/CodeGen/X86/scalarize-fp.ll
@@ -5,14 +5,12 @@
 define <4 x float> @fadd_op1_constant_v4f32(float %x) nounwind {
 ; SSE-LABEL: fadd_op1_constant_v4f32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE-NEXT:    addps %xmm1, %xmm0
+; SSE-NEXT:    addss {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: fadd_op1_constant_v4f32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vaddss {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %v = insertelement <4 x float> undef, float %x, i32 0
   %b = fadd <4 x float> %v, 
@@ -22,16 +20,14 @@
 define <4 x float> @load_fadd_op1_constant_v4f32(float* %p) nounwind {
 ; SSE-LABEL: load_fadd_op1_constant_v4f32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT:    addps %xmm1, %xmm0
+; SSE-NEXT:    addss {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: load_fadd_op1_constant_v4f32:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vaddss {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %x = load float, float* %p
   %v = insertelement <4 x float> undef, float %x, i32 0
@@ -43,14 +39,14 @@
 ; SSE-LABEL: fsub_op0_constant_v4f32:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE-NEXT:    subps %xmm0, %xmm1
+; SSE-NEXT:    subss %xmm0, %xmm1
 ; SSE-NEXT:    movaps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: fsub_op0_constant_v4f32:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-NEXT:    vsubps %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
   %v = insertelement <4 x float> undef, float %x, i32 0
   %b = fsub <4 x float> , %v
@@ -60,16 +56,14 @@
 define <4 x float> @load_fsub_op0_constant_v4f32(float* %p) nounwind {
 ; SSE-LABEL: load_fsub_op0_constant_v4f32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT:    subps %xmm1, %xmm0
+; SSE-NEXT:    subss (%rdi), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: load_fsub_op0_constant_v4f32:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-NEXT:    vsubps %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vsubss (%rdi), %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %x = load float, float* %p
   %v = insertelement <4 x float> undef, float %x, i32 0
@@ -80,14 +74,12 @@
 define <4 x float> @fmul_op1_constant_v4f32(float %x) nounwind {
 ; SSE-LABEL: fmul_op1_constant_v4f32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE-NEXT:    mulps %xmm1, %xmm0
+; SSE-NEXT:    mulss {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: fmul_op1_constant_v4f32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %v = insertelement <4 x float> undef, float %x, i32 0
   %b = fmul <4 x float> %v, 
@@ -97,16 +89,14 @@
 define <4 x float> @load_fmul_op1_constant_v4f32(float* %p) nounwind {
 ; SSE-LABEL: load_fmul_op1_constant_v4f32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT:    mulps %xmm1, %xmm0
+; SSE-NEXT:    mulss {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: load_fmul_op1_constant_v4f32:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %x = load float, float* %p
   %v = insertelement <4 x float> undef, float %x, i32 0
@@ -117,14 +107,12 @@
 define <4 x float> @fdiv_op1_constant_v4f32(float %x) nounwind {
 ; SSE-LABEL: fdiv_op1_constant_v4f32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE-NEXT:    divps %xmm1, %xmm0
+; SSE-NEXT:    divss {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: fdiv_op1_constant_v4f32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-NEXT:    vdivps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vdivss {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %v = insertelement <4 x float> undef, float %x, i32 0
   %b = fdiv <4 x float> %v, 
@@ -135,15 +123,13 @@
 ; SSE-LABEL: load_fdiv_op1_constant_v4f32:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE-NEXT:    divps %xmm1, %xmm0
+; SSE-NEXT:    divss {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: load_fdiv_op1_constant_v4f32:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-NEXT:    vdivps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vdivss {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %x = load float, float* %p
   %v = insertelement <4 x float> undef, float %x, i32 0
@@ -155,14 +141,14 @@
 ; SSE-LABEL: fdiv_op0_constant_v4f32:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE-NEXT:    divps %xmm0, %xmm1
+; SSE-NEXT:    divss %xmm0, %xmm1
 ; SSE-NEXT:    movaps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: fdiv_op0_constant_v4f32:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-NEXT:    vdivps %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vdivss %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
   %v = insertelement <4 x float> undef, float %x, i32 0
   %b = fdiv <4 x float> , %v
@@ -172,16 +158,14 @@
 define <4 x float> @load_fdiv_op0_constant_v4f32(float* %p) nounwind {
 ; SSE-LABEL: load_fdiv_op0_constant_v4f32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT:    divps %xmm1, %xmm0
+; SSE-NEXT:    divss (%rdi), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: load_fdiv_op0_constant_v4f32:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-NEXT:    vdivps %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vdivss (%rdi), %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %x = load float, float* %p
   %v = insertelement <4 x float> undef, float %x, i32 0
@@ -192,14 +176,12 @@
 define <4 x double> @fadd_op1_constant_v4f64(double %x) nounwind {
 ; SSE-LABEL: fadd_op1_constant_v4f64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; SSE-NEXT:    addpd %xmm1, %xmm0
+; SSE-NEXT:    addsd {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: fadd_op1_constant_v4f64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vaddsd {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %v = insertelement <4 x double> undef, double %x, i32 0
   %b = fadd <4 x double> %v, 
@@ -209,16 +191,14 @@
 define <4 x double> @load_fadd_op1_constant_v4f64(double* %p) nounwind {
 ; SSE-LABEL: load_fadd_op1_constant_v4f64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
 ; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-NEXT:    addpd %xmm1, %xmm0
+; SSE-NEXT:    addsd {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: load_fadd_op1_constant_v4f64:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vaddsd {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %x = load double, double* %p
   %v = insertelement <4 x double> undef, double %x, i32 0
@@ -230,14 +210,14 @@
 ; SSE-LABEL: fsub_op0_constant_v4f64:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; SSE-NEXT:    subpd %xmm0, %xmm1
+; SSE-NEXT:    subsd %xmm0, %xmm1
 ; SSE-NEXT:    movapd %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: fsub_op0_constant_v4f64:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vsubpd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
   %v = insertelement <4 x double> undef, double %x, i32 0
   %b = fsub <4 x double> , %v
@@ -247,16 +227,14 @@
 define <4 x double> @load_fsub_op0_constant_v4f64(double* %p) nounwind {
 ; SSE-LABEL: load_fsub_op0_constant_v4f64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
 ; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-NEXT:    subpd %xmm1, %xmm0
+; SSE-NEXT:    subsd (%rdi), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: load_fsub_op0_constant_v4f64:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vsubpd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vsubsd (%rdi), %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %x = load double, double* %p
   %v = insertelement <4 x double> undef, double %x, i32 0
@@ -267,14 +245,12 @@
 define <4 x double> @fmul_op1_constant_v4f64(double %x) nounwind {
 ; SSE-LABEL: fmul_op1_constant_v4f64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; SSE-NEXT:    mulpd %xmm1, %xmm0
+; SSE-NEXT:    mulsd {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: fmul_op1_constant_v4f64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmulsd {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %v = insertelement <4 x double> undef, double %x, i32 0
   %b = fmul <4 x double> %v, 
@@ -284,16 +260,14 @@
 define <4 x double> @load_fmul_op1_constant_v4f64(double* %p) nounwind {
 ; SSE-LABEL: load_fmul_op1_constant_v4f64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
 ; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-NEXT:    mulpd %xmm1, %xmm0
+; SSE-NEXT:    mulsd {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: load_fmul_op1_constant_v4f64:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmulsd {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %x = load double, double* %p
   %v = insertelement <4 x double> undef, double %x, i32 0
@@ -304,14 +278,12 @@
 define <4 x double> @fdiv_op1_constant_v4f64(double %x) nounwind {
 ; SSE-LABEL: fdiv_op1_constant_v4f64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; SSE-NEXT:    divpd %xmm1, %xmm0
+; SSE-NEXT:    divsd {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: fdiv_op1_constant_v4f64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vdivpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vdivsd {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %v = insertelement <4 x double> undef, double %x, i32 0
   %b = fdiv <4 x double> %v, 
@@ -322,15 +294,13 @@
 ; SSE-LABEL: load_fdiv_op1_constant_v4f64:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; SSE-NEXT:    divpd %xmm1, %xmm0
+; SSE-NEXT:    divsd {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: load_fdiv_op1_constant_v4f64:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vdivpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vdivsd {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %x = load double, double* %p
   %v = insertelement <4 x double> undef, double %x, i32 0
@@ -342,14 +312,14 @@
 ; SSE-LABEL: fdiv_op0_constant_v4f64:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; SSE-NEXT:    divpd %xmm0, %xmm1
+; SSE-NEXT:    divsd %xmm0, %xmm1
 ; SSE-NEXT:    movapd %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: fdiv_op0_constant_v4f64:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vdivpd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vdivsd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
   %v = insertelement <4 x double> undef, double %x, i32 0
   %b = fdiv <4 x double> , %v
@@ -359,16 +329,14 @@
 define <4 x double> @load_fdiv_op0_constant_v4f64(double* %p) nounwind {
 ; SSE-LABEL: load_fdiv_op0_constant_v4f64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
 ; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-NEXT:    divpd %xmm1, %xmm0
+; SSE-NEXT:    divsd (%rdi), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: load_fdiv_op0_constant_v4f64:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vdivpd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vdivsd (%rdi), %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %x = load double, double* %p
   %v = insertelement <4 x double> undef, double %x, i32 0
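
A minimal standalone reproducer for the new fold, for illustration only; it is not part of the committed patch, and the function name and constant value are arbitrary. Fed to llc with something like -mtriple=x86_64-- -mattr=+avx, the fadd of two single-lane build vectors should now be selected as one scalar vaddss from the constant pool instead of a full-width vaddps, matching the test updates above.

; Hypothetical example (not in scalarize-fp.ll); the constant 1.0 is arbitrary.
define <4 x float> @scalarize_example(float %x) nounwind {
  %v = insertelement <4 x float> undef, float %x, i32 0
  %b = fadd <4 x float> %v, <float 1.0, float undef, float undef, float undef>
  ret <4 x float> %b
}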