Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -9579,6 +9579,13 @@ if (N1CFP && N1CFP->isExactlyValue(+2.0)) return DAG.getNode(ISD::FADD, DL, VT, N0, N0, Flags); + // fold (fmul X, -2.0) -> (fneg (fadd X, X)) + if (N1CFP && N1CFP->isExactlyValue(-2.0)) + if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT)) { + SDValue Add = DAG.getNode(ISD::FADD, DL, VT, N0, N0, Flags); + return DAG.getNode(ISD::FNEG, DL, VT, Add); + } + // fold (fmul X, -1.0) -> (fneg X) if (N1CFP && N1CFP->isExactlyValue(-1.0)) if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT)) Index: test/CodeGen/AArch64/fmul-combines.ll =================================================================== --- /dev/null +++ test/CodeGen/AArch64/fmul-combines.ll @@ -0,0 +1,30 @@ +; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -verify-machineinstrs | FileCheck %s + +; CHECK-LABEL: test1: +; CHECK: fadd s0, s0, s0 +; CHECK: fneg s0, s0 +define float @test1(float %x) { + %y = fmul float %x, -2.0 + ret float %y +} + +; CHECK-LABEL: test2: +; CHECK: fadd d0, d0, d0 +; CHECK: fneg d0, d0 +define double @test2(double %x) { + %y = fmul double %x, -2.0 + ret double %y +} + +; a * b - 2.0 * c +; CHECK-LABEL: test3: +; CHECK: fmul d0, d0, d1 +; CHECK: fadd d1, d2, d2 +; CHECK: fsub d0, d0, d1 +define double @test3(double %a, double %b, double %d) { +entry: + %mul = fmul double %a, %b + %mul1 = fmul double %d, 2.000000e+00 + %sub = fsub double %mul, %mul1 + ret double %sub +} Index: test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll =================================================================== --- test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll +++ test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll @@ -84,8 +84,8 @@ } ; GCN-LABEL: {{^}}fmul_x2_xn2_f32: -; GCN: v_mul_f32_e64 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], -4.0 -; GCN: v_mul_f32_e32 
[[RESULT:v[0-9]+]], [[X]], [[TMP0]] +; GCN: v_add_f32_e64 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], [[X]] +; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[TMP0]], [[TMP0]] ; GCN: buffer_store_dword [[RESULT]] define amdgpu_kernel void @fmul_x2_xn2_f32(float addrspace(1)* %out, float %x, float %y) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 @@ -200,8 +200,8 @@ } ; GCN-LABEL: {{^}}fmul_x2_xn2_f16: -; GCN: v_mul_f16_e64 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], -4.0 -; GCN: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]] +; GCN: v_add_f16_e64 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], [[X]] +; GCN: v_mul_f16_e64 [[RESULT:v[0-9]+]], -[[TMP0]], [[TMP0]] ; GCN: buffer_store_short [[RESULT]] define amdgpu_kernel void @fmul_x2_xn2_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { %x = bitcast i16 %x.arg to half Index: test/CodeGen/AMDGPU/fmuladd.f32.ll =================================================================== --- test/CodeGen/AMDGPU/fmuladd.f32.ll +++ test/CodeGen/AMDGPU/fmuladd.f32.ll @@ -191,8 +191,8 @@ ; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]] -; GCN-DENORM-SLOWFMA: v_mul_f32_e32 [[TMP:v[0-9]+]], -2.0, [[R1]] -; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] +; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] +; GCN-DENORM-SLOWFMA: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] ; SI-DENORM: buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] @@ -251,8 +251,8 @@ ; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], -[[R1]], 2.0, [[R2]] -; GCN-DENORM-SLOWFMA: v_mul_f32_e32 [[TMP:v[0-9]+]], -2.0, [[R1]] -; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] +; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] +; GCN-DENORM-SLOWFMA: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] ; SI-DENORM: buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 
Index: test/CodeGen/X86/fmul-combines.ll =================================================================== --- test/CodeGen/X86/fmul-combines.ll +++ test/CodeGen/X86/fmul-combines.ll @@ -17,6 +17,15 @@ ret <4 x float> %y } +; CHECK-LABEL: fmulneg2_v4f32: +; CHECK: addps %xmm0, %xmm0 +; CHECK: xorps +; CHECK-NEXT: retq +define <4 x float> @fmulneg2_v4f32(<4 x float> %x) { + %y = fmul <4 x float> %x, <float -2.0, float -2.0, float -2.0, float -2.0> + ret <4 x float> %y +} + ; CHECK-LABEL: constant_fold_fmul_v4f32: ; CHECK: movaps ; CHECK-NEXT: ret