diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -11879,7 +11879,8 @@ // fold (fsub x, (fma y, z, (fmul u, v))) // -> (fma (fneg y), z, (fma (fneg u), v, x)) if (CanFuse && N1.getOpcode() == PreferredFusedOpcode && - isContractableFMUL(N1.getOperand(2))) { + isContractableFMUL(N1.getOperand(2)) && + N1->hasOneUse()) { SDValue N20 = N1.getOperand(2).getOperand(0); SDValue N21 = N1.getOperand(2).getOperand(1); return DAG.getNode(PreferredFusedOpcode, SL, VT, @@ -11894,7 +11895,8 @@ // fold (fsub (fma x, y, (fpext (fmul u, v))), z) // -> (fma x, y (fma (fpext u), (fpext v), (fneg z))) - if (N0.getOpcode() == PreferredFusedOpcode) { + if (N0.getOpcode() == PreferredFusedOpcode && + N0->hasOneUse()) { SDValue N02 = N0.getOperand(2); if (N02.getOpcode() == ISD::FP_EXTEND) { SDValue N020 = N02.getOperand(0); @@ -11946,7 +11948,8 @@ // fold (fsub x, (fma y, z, (fpext (fmul u, v)))) // -> (fma (fneg y), z, (fma (fneg (fpext u)), (fpext v), x)) if (N1.getOpcode() == PreferredFusedOpcode && - N1.getOperand(2).getOpcode() == ISD::FP_EXTEND) { + N1.getOperand(2).getOpcode() == ISD::FP_EXTEND && + N1->hasOneUse()) { SDValue N120 = N1.getOperand(2).getOperand(0); if (isContractableFMUL(N120) && TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, diff --git a/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll b/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll --- a/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll @@ -219,7 +219,7 @@ ret void } -; GCN-LABEL: {{^}}fast_sub_fmuladd_fmul_multi_use_fmuladd: +; GCN-LABEL: {{^}}fast_sub_fmuladd_fmul_multi_use_fmuladd_lhs: ; GCN: buffer_load_dword [[X:v[0-9]+]] ; GCN: buffer_load_dword [[Y:v[0-9]+]] ; GCN: buffer_load_dword [[Z:v[0-9]+]] @@ -241,7 +241,7 @@ ; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]] ; 
GCN-SLOWFMA: v_add_f32_e32 ; GCN-SLOWFMA: v_sub_f32_e32 -define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_fmuladd() #0 { +define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_fmuladd_lhs() #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -255,6 +255,120 @@ ret void } +; GCN-LABEL: {{^}}fast_sub_fmuladd_fmul_multi_use_fmuladd_rhs: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] +; GCN: buffer_load_dword [[Z:v[0-9]+]] +; GCN: buffer_load_dword [[U:v[0-9]+]] +; GCN: buffer_load_dword [[V:v[0-9]+]] + +; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]] + +; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[X]], [[Y]] +; GCN-FLUSH-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[MUL]] +; GCN-FLUSH-NEXT: buffer_store_dword [[MUL]] +; GCN-FLUSH-NEXT: buffer_store_dword [[SUB]] + +; GCN-FASTFMA-NEXT: v_fma_f32 [[FMA:v[0-9]+]], [[X]], [[Y]], [[U]] +; GCN-FASTFMA-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[FMA]] +; GCN-FASTFMA-NEXT: buffer_store_dword [[FMA]] +; GCN-FASTFMA-NEXT: buffer_store_dword [[SUB]] + +; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]] +; GCN-SLOWFMA: v_add_f32_e32 +; GCN-SLOWFMA: v_sub_f32_e32 +define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_fmuladd_rhs() #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %z = load volatile float, float addrspace(1)* undef + %u = load volatile float, float addrspace(1)* undef + %v = load volatile float, float addrspace(1)* undef + %mul.u.v = fmul fast float %u, %v + %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v) + %add = fsub fast float %z, %fma + store volatile float %fma, float addrspace(1)* undef + store volatile float %add, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_lhs: +; GCN: 
buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] +; GCN: buffer_load_dword [[Z:v[0-9]+]] +; GCN: buffer_load_ushort [[U:v[0-9]+]] +; GCN: buffer_load_ushort [[V:v[0-9]+]] + +; GCN-DAG: v_cvt_f32_f16_e32 [[UFLOAT:v[0-9]+]], [[U]] +; GCN-DAG: v_cvt_f32_f16_e32 [[VFLOAT:v[0-9]+]], [[V]] +; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[UFLOAT]], [[VFLOAT]] + +; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[X]], [[Y]] +; GCN-FLUSH-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[MUL]], [[Z]] +; GCN-FLUSH-NEXT: buffer_store_dword [[MUL]] +; GCN-FLUSH-NEXT: buffer_store_dword [[SUB]] + +; GCN-FASTFMA-NEXT: v_fma_f32 [[FMA:v[0-9]+]], [[X]], [[Y]], [[UFLOAT]] +; GCN-FASTFMA-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[FMA]], [[Z]] +; GCN-FASTFMA-NEXT: buffer_store_dword [[FMA]] +; GCN-FASTFMA-NEXT: buffer_store_dword [[SUB]] + +; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]] +; GCN-SLOWFMA: v_add_f32_e32 +; GCN-SLOWFMA: v_sub_f32_e32 +define amdgpu_kernel void @fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_lhs() #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %z = load volatile float, float addrspace(1)* undef + %u = load volatile half, half addrspace(1)* undef + %v = load volatile half, half addrspace(1)* undef + %mul.u.v.half = fmul fast half %u, %v + %mul.u.v = fpext half %mul.u.v.half to float + %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v) + %add = fsub fast float %fma, %z + store volatile float %fma, float addrspace(1)* undef + store volatile float %add, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_rhs: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] +; GCN: buffer_load_dword [[Z:v[0-9]+]] +; GCN: buffer_load_ushort [[U:v[0-9]+]] +; GCN: buffer_load_ushort [[V:v[0-9]+]] + +; GCN-DAG: v_cvt_f32_f16_e32 [[UFLOAT:v[0-9]+]], [[U]] +; GCN-DAG: v_cvt_f32_f16_e32 
[[VFLOAT:v[0-9]+]], [[V]] +; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[UFLOAT]], [[VFLOAT]] + +; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[X]], [[Y]] +; GCN-FLUSH-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[MUL]] +; GCN-FLUSH-NEXT: buffer_store_dword [[MUL]] +; GCN-FLUSH-NEXT: buffer_store_dword [[SUB]] + +; GCN-FASTFMA-NEXT: v_fma_f32 [[FMA:v[0-9]+]], [[X]], [[Y]], [[UFLOAT]] +; GCN-FASTFMA-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[FMA]] +; GCN-FASTFMA-NEXT: buffer_store_dword [[FMA]] +; GCN-FASTFMA-NEXT: buffer_store_dword [[SUB]] + +; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]] +; GCN-SLOWFMA: v_add_f32_e32 +; GCN-SLOWFMA: v_sub_f32_e32 +define amdgpu_kernel void @fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_rhs() #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %z = load volatile float, float addrspace(1)* undef + %u = load volatile half, half addrspace(1)* undef + %v = load volatile half, half addrspace(1)* undef + %mul.u.v.half = fmul fast half %u, %v + %mul.u.v = fpext half %mul.u.v.half to float + %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v) + %add = fsub fast float %z, %fma + store volatile float %fma, float addrspace(1)* undef + store volatile float %add, float addrspace(1)* undef + ret void +} + declare float @llvm.fma.f32(float, float, float) #1 declare float @llvm.fmuladd.f32(float, float, float) #1