Index: llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -2577,6 +2577,14 @@
   if (Src.getOpcode() == ISD::FNEG) {
     Mods |= SISrcMods::NEG;
     Src = Src.getOperand(0);
+  } else if (Src.getOpcode() == ISD::FSUB) {
+    // Fold fsub [+-]0 into fneg. This may not have folded depending on the
+    // denormal mode, but we're implicitly canonicalizing in a source operand.
+    auto *LHS = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
+    if (LHS && LHS->isZero()) {
+      Mods |= SISrcMods::NEG;
+      Src = Src.getOperand(1);
+    }
   }
 
   if (AllowAbs && Src.getOpcode() == ISD::FABS) {
@@ -2677,6 +2685,7 @@
   unsigned Mods = 0;
   Src = In;
 
+  // TODO: Handle G_FSUB 0 as fneg
   if (Src.getOpcode() == ISD::FNEG) {
     Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
     Src = Src.getOperand(0);
Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3496,6 +3496,15 @@
     Src = MI->getOperand(1).getReg();
     Mods |= SISrcMods::NEG;
     MI = getDefIgnoringCopies(Src, *MRI);
+  } else if (MI->getOpcode() == AMDGPU::G_FSUB) {
+    // Fold fsub [+-]0 into fneg. This may not have folded depending on the
+    // denormal mode, but we're implicitly canonicalizing in a source operand.
+    const ConstantFP *LHS =
+        getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
+    if (LHS && LHS->isZero()) {
+      Mods |= SISrcMods::NEG;
+      Src = MI->getOperand(2).getReg();
+    }
   }
 
   if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
@@ -3633,6 +3642,8 @@
     MI = MRI.getVRegDef(Src);
   }
 
+  // TODO: Handle G_FSUB 0 as fneg
+
   // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
   (void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard()
Index: llvm/test/CodeGen/AMDGPU/fneg-combines.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/fneg-combines.ll
+++ llvm/test/CodeGen/AMDGPU/fneg-combines.ll
@@ -2608,10 +2608,9 @@
 }
 
 ; This expects denormal flushing, so can't turn this fmul into fneg
-; TODO: Keeping this as fmul saves encoding size
 ; GCN-LABEL: {{^}}nnan_fmul_neg1_to_fneg:
-; GCN: v_sub_f32_e32 [[TMP:v[0-9]+]], 0x80000000, v0
-; GCN-NEXT: v_mul_f32_e32 v0, [[TMP]], v1
+; GCN: s_waitcnt
+; GCN-NEXT: v_mul_f32_e64 v0, -v0, v1
 define float @nnan_fmul_neg1_to_fneg(float %x, float %y) #0 {
   %mul = fmul float %x, -1.0
   %add = fmul nnan float %mul, %y
@@ -2631,8 +2630,9 @@
 ; know the source can't be an snan
 ; GCN-LABEL: {{^}}denorm_snan_fmul_neg1_to_fneg:
-; GCN: v_mul_f32_e64 [[TMP:v[0-9]+]], v0, -v0
-; GCN: v_mul_f32_e32 v0, [[TMP]], v1
+; GCN: s_waitcnt
+; GCN-NEXT: v_mul_f32_e64 [[TMP:v[0-9]+]], v0, -v0
+; GCN-NEXT: v_mul_f32_e32 v0, [[TMP]], v1
 ; GCN-NEXT: s_setpc_b64
 define float @denorm_snan_fmul_neg1_to_fneg(float %x, float %y) {
   %canonical = fmul float %x, %x
@@ -2642,9 +2642,9 @@
 }
 
 ; GCN-LABEL: {{^}}flush_snan_fmul_neg1_to_fneg:
-; GCN: v_mul_f32_e32 [[TMP0:v[0-9]+]], 1.0, v0
-; GCN: v_sub_f32_e32 [[TMP1:v[0-9]+]], 0x80000000, [[TMP0]]
-; GCN-NEXT: v_mul_f32_e32 v0, [[TMP1]], v1
+; GCN: s_waitcnt
+; GCN-NEXT: v_mul_f32_e32 [[TMP:v[0-9]+]], 1.0, v0
+; GCN-NEXT: v_mul_f32_e64 v0, -[[TMP]], v1
 define float @flush_snan_fmul_neg1_to_fneg(float %x, float %y) #0 {
   %quiet = call float @llvm.canonicalize.f32(float %x)
   %mul = fmul float %quiet, -1.0
Index: llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
+++ llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
@@ -2641,8 +2641,7 @@
 ; GCN-LABEL: nnan_fmul_neg1_to_fneg:
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_sub_f32_e32 v0, 0x80000000, v0
-; GCN-NEXT: v_mul_f32_e32 v0, v0, v1
+; GCN-NEXT: v_mul_f32_e64 v0, -v0, v1
 ; GCN-NEXT: s_setpc_b64 s[30:31]
   %mul = fmul float %x, -1.0
   %add = fmul nnan float %mul, %y
@@ -2681,8 +2680,7 @@
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_sub_f32_e32 v0, 0x80000000, v0
-; GCN-NEXT: v_mul_f32_e32 v0, v0, v1
+; GCN-NEXT: v_mul_f32_e64 v0, -v0, v1
 ; GCN-NEXT: s_setpc_b64 s[30:31]
   %quiet = call float @llvm.canonicalize.f32(float %x)
   %mul = fmul float %quiet, -1.0
Index: llvm/test/CodeGen/AMDGPU/fsub-as-fneg-src-modifier.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/fsub-as-fneg-src-modifier.ll
+++ llvm/test/CodeGen/AMDGPU/fsub-as-fneg-src-modifier.ll
@@ -49,24 +49,36 @@
 }
 
 define float @fold_f32_fsub_into_fneg_modifier_ieee_pos0(float %v0, float %v1) #0 {
-; CHECK-LABEL: fold_f32_fsub_into_fneg_modifier_ieee_pos0:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_sub_f32_e32 v0, 0, v0
-; CHECK-NEXT: v_mul_f32_e32 v0, v0, v1
-; CHECK-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: fold_f32_fsub_into_fneg_modifier_ieee_pos0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_mul_f32_e64 v0, -v0, v1
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: fold_f32_fsub_into_fneg_modifier_ieee_pos0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_sub_f32_e32 v0, 0, v0
+; GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GISEL-NEXT: s_setpc_b64 s[30:31]
   %sub = fsub float 0.0, %v0
   %mul = fmul float %sub, %v1
   ret float %mul
 }
 
 define float @fold_f32_fsub_into_fneg_modifier_daz_pos0(float %v0, float %v1) #1 {
-; CHECK-LABEL: fold_f32_fsub_into_fneg_modifier_daz_pos0:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_sub_f32_e32 v0, 0, v0
-; CHECK-NEXT: v_mul_f32_e32 v0, v0, v1
-; CHECK-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: fold_f32_fsub_into_fneg_modifier_daz_pos0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_mul_f32_e64 v0, -v0, v1
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: fold_f32_fsub_into_fneg_modifier_daz_pos0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_sub_f32_e32 v0, 0, v0
+; GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GISEL-NEXT: s_setpc_b64 s[30:31]
   %sub = fsub float 0.0, %v0
   %mul = fmul float %sub, %v1
   ret float %mul
@@ -113,8 +125,7 @@
 ; SDAG-LABEL: fold_f32_fsub_into_fneg_modifier_daz:
 ; SDAG: ; %bb.0:
 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_sub_f32_e32 v0, 0x80000000, v0
-; SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; SDAG-NEXT: v_mul_f32_e64 v0, -v0, v1
 ; SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_f32_fsub_into_fneg_modifier_daz:
@@ -150,8 +161,7 @@
 ; SDAG-LABEL: fold_f32_fsub_into_fneg_modifier_daz_nsz:
 ; SDAG: ; %bb.0:
 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_sub_f32_e32 v0, 0x80000000, v0
-; SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; SDAG-NEXT: v_mul_f32_e64 v0, -v0, v1
 ; SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_f32_fsub_into_fneg_modifier_daz_nsz:
@@ -169,8 +179,7 @@
 ; SDAG-LABEL: fold_f32_fsub_into_fneg_modifier_dynamic:
 ; SDAG: ; %bb.0:
 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_sub_f32_e32 v0, 0x80000000, v0
-; SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; SDAG-NEXT: v_mul_f32_e64 v0, -v0, v1
 ; SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_f32_fsub_into_fneg_modifier_dynamic:
@@ -188,8 +197,7 @@
 ; SDAG-LABEL: fold_f32_fsub_into_fneg_modifier_dynamic_nsz:
 ; SDAG: ; %bb.0:
 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_sub_f32_e32 v0, 0x80000000, v0
-; SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; SDAG-NEXT: v_mul_f32_e64 v0, -v0, v1
 ; SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_f32_fsub_into_fneg_modifier_dynamic_nsz:
@@ -228,10 +236,8 @@
 ; SDAG-LABEL: fold_v2f32_fsub_into_fneg_modifier_daz:
 ; SDAG: ; %bb.0:
 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_sub_f32_e32 v1, 0x80000000, v1
-; SDAG-NEXT: v_sub_f32_e32 v0, 0x80000000, v0
-; SDAG-NEXT: v_mul_f32_e32 v0, v0, v2
-; SDAG-NEXT: v_mul_f32_e32 v1, v1, v3
+; SDAG-NEXT: v_mul_f32_e64 v0, -v0, v2
+; SDAG-NEXT: v_mul_f32_e64 v1, -v1, v3
 ; SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_v2f32_fsub_into_fneg_modifier_daz:
@@ -272,10 +278,8 @@
 ; SDAG-LABEL: fold_v2f32_fsub_into_fneg_modifier_daz_nsz:
 ; SDAG: ; %bb.0:
 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_sub_f32_e32 v1, 0x80000000, v1
-; SDAG-NEXT: v_sub_f32_e32 v0, 0x80000000, v0
-; SDAG-NEXT: v_mul_f32_e32 v0, v0, v2
-; SDAG-NEXT: v_mul_f32_e32 v1, v1, v3
+; SDAG-NEXT: v_mul_f32_e64 v0, -v0, v2
+; SDAG-NEXT: v_mul_f32_e64 v1, -v1, v3
 ; SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_v2f32_fsub_into_fneg_modifier_daz_nsz:
@@ -295,10 +299,8 @@
 ; SDAG-LABEL: fold_v2f32_fsub_into_fneg_modifier_dynamic:
 ; SDAG: ; %bb.0:
 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_sub_f32_e32 v1, 0x80000000, v1
-; SDAG-NEXT: v_sub_f32_e32 v0, 0x80000000, v0
-; SDAG-NEXT: v_mul_f32_e32 v0, v0, v2
-; SDAG-NEXT: v_mul_f32_e32 v1, v1, v3
+; SDAG-NEXT: v_mul_f32_e64 v0, -v0, v2
+; SDAG-NEXT: v_mul_f32_e64 v1, -v1, v3
 ; SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_v2f32_fsub_into_fneg_modifier_dynamic:
@@ -318,10 +320,8 @@
 ; SDAG-LABEL: fold_v2f32_fsub_into_fneg_modifier_dynamic_nsz:
 ; SDAG: ; %bb.0:
 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_sub_f32_e32 v1, 0x80000000, v1
-; SDAG-NEXT: v_sub_f32_e32 v0, 0x80000000, v0
-; SDAG-NEXT: v_mul_f32_e32 v0, v0, v2
-; SDAG-NEXT: v_mul_f32_e32 v1, v1, v3
+; SDAG-NEXT: v_mul_f32_e64 v0, -v0, v2
+; SDAG-NEXT: v_mul_f32_e64 v1, -v1, v3
 ; SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_v2f32_fsub_into_fneg_modifier_dynamic_nsz:
@@ -360,8 +360,7 @@
 ; SDAG-LABEL: fold_f16_fsub_into_fneg_modifier_daz:
 ; SDAG: ; %bb.0:
 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_sub_f16_e32 v0, 0x8000, v0
-; SDAG-NEXT: v_mul_f16_e32 v0, v0, v1
+; SDAG-NEXT: v_mul_f16_e64 v0, -v0, v1
 ; SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_f16_fsub_into_fneg_modifier_daz:
@@ -397,8 +396,7 @@
 ; SDAG-LABEL: fold_f16_fsub_into_fneg_modifier_daz_nsz:
 ; SDAG: ; %bb.0:
 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_sub_f16_e32 v0, 0x8000, v0
-; SDAG-NEXT: v_mul_f16_e32 v0, v0, v1
+; SDAG-NEXT: v_mul_f16_e64 v0, -v0, v1
 ; SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_f16_fsub_into_fneg_modifier_daz_nsz:
@@ -416,8 +414,7 @@
 ; SDAG-LABEL: fold_f16_fsub_into_fneg_modifier_dynamic:
 ; SDAG: ; %bb.0:
 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_sub_f16_e32 v0, 0x8000, v0
-; SDAG-NEXT: v_mul_f16_e32 v0, v0, v1
+; SDAG-NEXT: v_mul_f16_e64 v0, -v0, v1
 ; SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_f16_fsub_into_fneg_modifier_dynamic:
@@ -435,8 +432,7 @@
 ; SDAG-LABEL: fold_f16_fsub_into_fneg_modifier_dynamic_nsz:
 ; SDAG: ; %bb.0:
 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_sub_f16_e32 v0, 0x8000, v0
-; SDAG-NEXT: v_mul_f16_e32 v0, v0, v1
+; SDAG-NEXT: v_mul_f16_e64 v0, -v0, v1
 ; SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_f16_fsub_into_fneg_modifier_dynamic_nsz:
Index: llvm/test/CodeGen/AMDGPU/llvm.exp.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/llvm.exp.ll
+++ llvm/test/CodeGen/AMDGPU/llvm.exp.ll
@@ -5793,8 +5793,8 @@
 ; VI-SDAG: ; %bb.0:
 ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT: v_rndne_f32_e32 v0, 0
-; VI-SDAG-NEXT: v_sub_f32_e32 v1, 0, v0
-; VI-SDAG-NEXT: v_add_f32_e32 v1, 0x7fc00000, v1
+; VI-SDAG-NEXT: s_mov_b32 s4, 0x7fc00000
+; VI-SDAG-NEXT: v_add_f32_e64 v1, -v0, s4
 ; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1
 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v0, v0
 ; VI-SDAG-NEXT: v_ldexp_f32 v0, v1, v0
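
Note (not part of the patch): the change lets the selector treat an fsub from a floating-point zero as a free negation source modifier instead of a materialized subtraction. A minimal sketch of the pattern this targets, modeled on the fsub-as-fneg-src-modifier.ll tests above; the function name is illustrative, and the before/after assembly is taken from the updated SDAG checks:

define float @fsub_zero_becomes_neg_modifier(float %v0, float %v1) {
  ; Before: v_sub_f32_e32 v0, 0, v0  followed by  v_mul_f32_e32 v0, v0, v1
  ; After:  v_mul_f32_e64 v0, -v0, v1   (the fsub is folded into a neg source modifier)
  %sub = fsub float 0.0, %v0
  %mul = fmul float %sub, %v1
  ret float %mul
}

As the TODO comments indicate, the GlobalISel VOP3P path does not yet perform the same fold, which is why the GISEL checks above still show the explicit v_sub/v_mul sequence.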