Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -2787,6 +2787,23 @@
       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
     return Res;
   }
+  case ISD::FMUL: {
+    // (fneg (fmul x, y)) -> (fmul x, (fneg y))
+    SDValue LHS = N0.getOperand(0);
+    SDValue RHS = N0.getOperand(1);
+
+    if (LHS.getOpcode() == ISD::FNEG)
+      LHS = LHS.getOperand(0);
+    else if (RHS.getOpcode() == ISD::FNEG)
+      RHS = RHS.getOperand(0);
+    else
+      RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
+
+    SDValue Res = DAG.getNode(ISD::FMUL, SL, VT, LHS, RHS);
+    if (!N0.hasOneUse())
+      DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
+    return Res;
+  }
   default:
     return SDValue();
   }
Index: test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
===================================================================
--- test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
+++ test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
@@ -19,8 +19,8 @@
 ; VI: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, |v{{[0-9]+}}|
 ; VI: v_cndmask_b32_e32
 ; VI: v_add_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}|
-; VI: v_mul_f32_e32
-; VI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
+; VI: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
+; VI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 1.0
 define void @multiple_fadd_use_test_f32(float addrspace(1)* %out, float %x, float %y, float %z) #0 {
   %a11 = fadd fast float %y, -1.0
   %a12 = call float @llvm.fabs.f32(float %a11)
@@ -114,8 +114,8 @@
 ; VI: v_cmp_gt_f16_e64 vcc, |v{{[0-9]+}}|, |v{{[0-9]+}}|
 ; VI: v_cndmask_b32_e32
 ; VI: v_add_f16_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}|
-; VI: v_mul_f16_e32
-; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
+; VI: v_mul_f16_e64 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
+; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 1.0
 define void @multiple_fadd_use_test_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 {
   %x = bitcast i16 %x.arg to half
   %y = bitcast i16 %y.arg to half
Index: test/CodeGen/AMDGPU/fneg-combines.ll
===================================================================
--- test/CodeGen/AMDGPU/fneg-combines.ll
+++ test/CodeGen/AMDGPU/fneg-combines.ll
@@ -173,6 +173,183 @@
   ret void
 }
 
+; --------------------------------------------------------------------------------
+; fmul tests
+; --------------------------------------------------------------------------------
+
+; GCN-LABEL: {{^}}v_fneg_mul_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]]
+; GCN-NEXT: buffer_store_dword [[RESULT]]
+define void @v_fneg_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile float, float addrspace(1)* %a.gep
+  %b = load volatile float, float addrspace(1)* %b.gep
+  %mul = fmul float %a, %b
+  %fneg = fsub float -0.000000e+00, %mul
+  store float %fneg, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_mul_store_use_mul_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN-DAG: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_xor_b32_e32 [[NEG_MUL:v[0-9]+]], 0x80000000, [[ADD]]
+; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
+; GCN: buffer_store_dword [[ADD]]
+define void @v_fneg_mul_store_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile float, float addrspace(1)* %a.gep
+  %b = load volatile float, float addrspace(1)* %b.gep
+  %mul = fmul float %a, %b
+  %fneg = fsub float -0.000000e+00, %mul
+  store volatile float %fneg, float addrspace(1)* %out
+  store volatile float %mul, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_mul_multi_use_mul_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN-DAG: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_xor_b32_e32 [[NEG_MUL:v[0-9]+]], 0x80000000, [[ADD]]
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]]
+; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
+; GCN: buffer_store_dword [[MUL]]
+define void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile float, float addrspace(1)* %a.gep
+  %b = load volatile float, float addrspace(1)* %b.gep
+  %mul = fmul float %a, %b
+  %fneg = fsub float -0.000000e+00, %mul
+  %use1 = fmul float %mul, 4.0
+  store volatile float %fneg, float addrspace(1)* %out
+  store volatile float %use1, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_mul_fneg_x_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-NEXT: buffer_store_dword [[ADD]]
+define void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile float, float addrspace(1)* %a.gep
+  %b = load volatile float, float addrspace(1)* %b.gep
+  %fneg.a = fsub float -0.000000e+00, %a
+  %mul = fmul float %fneg.a, %b
+  %fneg = fsub float -0.000000e+00, %mul
+  store volatile float %fneg, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_mul_x_fneg_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-NEXT: buffer_store_dword [[ADD]]
+define void @v_fneg_mul_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile float, float addrspace(1)* %a.gep
+  %b = load volatile float, float addrspace(1)* %b.gep
+  %fneg.b = fsub float -0.000000e+00, %b
+  %mul = fmul float %a, %fneg.b
+  %fneg = fsub float -0.000000e+00, %mul
+  store volatile float %fneg, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_mul_fneg_fneg_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_mul_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]]
+; GCN-NEXT: buffer_store_dword [[ADD]]
+define void @v_fneg_mul_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile float, float addrspace(1)* %a.gep
+  %b = load volatile float, float addrspace(1)* %b.gep
+  %fneg.a = fsub float -0.000000e+00, %a
+  %fneg.b = fsub float -0.000000e+00, %b
+  %mul = fmul float %fneg.a, %fneg.b
+  %fneg = fsub float -0.000000e+00, %mul
+  store volatile float %fneg, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_mul_store_use_fneg_x_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[B]], [[A]]
+; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
+; GCN: buffer_store_dword [[NEG_A]]
+define void @v_fneg_mul_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile float, float addrspace(1)* %a.gep
+  %b = load volatile float, float addrspace(1)* %b.gep
+  %fneg.a = fsub float -0.000000e+00, %a
+  %mul = fmul float %fneg.a, %b
+  %fneg = fsub float -0.000000e+00, %mul
+  store volatile float %fneg, float addrspace(1)* %out
+  store volatile float %fneg.a, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_mul_multi_use_fneg_x_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
+; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
+; GCN: buffer_store_dword [[MUL]]
+define void @v_fneg_mul_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile float, float addrspace(1)* %a.gep
+  %b = load volatile float, float addrspace(1)* %b.gep
+  %fneg.a = fsub float -0.000000e+00, %a
+  %mul = fmul float %fneg.a, %b
+  %fneg = fsub float -0.000000e+00, %mul
+  %use1 = fmul float %fneg.a, %c
+  store volatile float %fneg, float addrspace(1)* %out
+  store volatile float %use1, float addrspace(1)* %out
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
 attributes #0 = { nounwind }
Index: test/CodeGen/AMDGPU/mad-combine.ll
===================================================================
--- test/CodeGen/AMDGPU/mad-combine.ll
+++ test/CodeGen/AMDGPU/mad-combine.ll
@@ -273,12 +273,12 @@
 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
 ; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
 
-; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]]
+; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
 
 ; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]]
 
-; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
-; SI-DENORM-SLOWFMAF: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[TMP]], [[C]]
+; SI-DENORM-SLOWFMAF: v_mul_f32_e64 [[TMP:v[0-9]+]], [[A]], -[[B]]
+; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
 
 ; SI: buffer_store_dword [[RESULT]]
 define void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
@@ -306,15 +306,15 @@
 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
 ; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
 
-; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
-; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]]
+; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], -[[B]], -[[C]]
+; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], -[[B]], -[[D]]
 
 ; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
 ; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]]
 
-; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
-; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]]
-; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT1:v[0-9]+]], -[[TMP]], [[D]]
+; SI-DENORM-SLOWFMAF: v_mul_f32_e64 [[TMP:v[0-9]+]], [[A]], -[[B]]
+; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
+; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]
 
 ; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}