Index: llvm/lib/Target/AMDGPU/AMDGPUGISel.td =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -31,6 +31,10 @@ GIComplexOperandMatcher, GIComplexPatternEquiv; +def gi_vop3modsnoncanonicalizing : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + def gi_vop3_no_mods : GIComplexOperandMatcher, GIComplexPatternEquiv; Index: llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -216,8 +216,11 @@ bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const; bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods, + bool IsCanonicalizing = true, bool AllowAbs = true) const; bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3ModsNonCanonicalizing(SDValue In, SDValue &Src, + SDValue &SrcMods) const; bool SelectVOP3BMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3NoMods(SDValue In, SDValue &Src) const; bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods, Index: llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -2570,6 +2570,7 @@ bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &Mods, + bool IsCanonicalizing, bool AllowAbs) const { Mods = 0; Src = In; @@ -2577,6 +2578,14 @@ if (Src.getOpcode() == ISD::FNEG) { Mods |= SISrcMods::NEG; Src = Src.getOperand(0); + } else if (Src.getOpcode() == ISD::FSUB && IsCanonicalizing) { + // Fold fsub [+-]0 into fneg. This may not have folded depending on the + // denormal mode, but we're implicitly canonicalizing in a source operand. + auto *LHS = dyn_cast(Src.getOperand(0)); + if (LHS && LHS->isZero()) { + Mods |= SISrcMods::NEG; + Src = Src.getOperand(1); + } } if (AllowAbs && Src.getOpcode() == ISD::FABS) { @@ -2590,7 +2599,20 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const { unsigned Mods; - if (SelectVOP3ModsImpl(In, Src, Mods)) { + if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/true, + /*AllowAbs=*/true)) { + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; + } + + return false; +} + +bool AMDGPUDAGToDAGISel::SelectVOP3ModsNonCanonicalizing( + SDValue In, SDValue &Src, SDValue &SrcMods) const { + unsigned Mods; + if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/false, + /*AllowAbs=*/true)) { SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); return true; } @@ -2601,7 +2623,9 @@ bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src, SDValue &SrcMods) const { unsigned Mods; - if (SelectVOP3ModsImpl(In, Src, Mods, /* AllowAbs */ false)) { + if (SelectVOP3ModsImpl(In, Src, Mods, + /*IsCanonicalizing=*/true, + /*AllowAbs=*/false)) { SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); return true; } @@ -2621,7 +2645,9 @@ SDValue &SrcMods, bool OpSel) const { unsigned Mods; - if (SelectVOP3ModsImpl(In, Src, Mods, /* AllowAbs */ false)) { + if (SelectVOP3ModsImpl(In, Src, Mods, + /*IsCanonicalizing=*/true, + /*AllowAbs=*/false)) { if (OpSel) Mods |= SISrcMods::OP_SEL_0; SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); @@ -2677,6 +2703,7 @@ unsigned Mods = 0; Src = In; + // TODO: Handle G_FSUB 0 as fneg if (Src.getOpcode() == ISD::FNEG) { Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); Src = Src.getOperand(0); Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -148,9 +148,10 @@ bool selectSMFMACIntrin(MachineInstr &I) const; bool selectWaveAddress(MachineInstr &I) const; - std::pair - selectVOP3ModsImpl(MachineOperand &Root, bool AllowAbs = true, - bool OpSel = false) const; + std::pair selectVOP3ModsImpl(MachineOperand &Root, + bool IsCanonicalizing = true, + bool AllowAbs = true, + bool OpSel = false) const; Register copyToVGPRIfSrcFolded(Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt, @@ -171,6 +172,8 @@ InstructionSelector::ComplexRendererFns selectVOP3Mods(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns + selectVOP3ModsNonCanonicalizing(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns selectVOP3BMods(MachineOperand &Root) const; ComplexRendererFns selectVOP3NoMods(MachineOperand &Root) const; Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -3486,8 +3486,10 @@ } -std::pair AMDGPUInstructionSelector::selectVOP3ModsImpl( - MachineOperand &Root, bool AllowAbs, bool OpSel) const { +std::pair +AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root, + bool IsCanonicalizing, + bool AllowAbs, bool OpSel) const { Register Src = Root.getReg(); unsigned Mods = 0; MachineInstr *MI = getDefIgnoringCopies(Src, *MRI); @@ -3496,6 +3498,15 @@ Src = MI->getOperand(1).getReg(); Mods |= SISrcMods::NEG; MI = getDefIgnoringCopies(Src, *MRI); + } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) { + // Fold fsub [+-]0 into fneg. This may not have folded depending on the + // denormal mode, but we're implicitly canonicalizing in a source operand. + const ConstantFP *LHS = + getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI); + if (LHS && LHS->isZero()) { + Mods |= SISrcMods::NEG; + Src = MI->getOperand(2).getReg(); + } } if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) { @@ -3558,7 +3569,9 @@ AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const { Register Src; unsigned Mods; - std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /* AllowAbs */ false); + std::tie(Src, Mods) = selectVOP3ModsImpl(Root, + /*IsCanonicalizing=*/true, + /*AllowAbs=*/false); return {{ [=](MachineInstrBuilder &MIB) { @@ -3593,11 +3606,27 @@ }}; } +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing( + MachineOperand &Root) const { + Register Src; + unsigned Mods; + std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/false); + + return {{ + [=](MachineInstrBuilder &MIB) { + MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB)); + }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods + }}; +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const { Register Src; unsigned Mods; - std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /* AllowAbs */ false); + std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/true, + /*AllowAbs=*/false); return {{ [=](MachineInstrBuilder &MIB) { @@ -3633,6 +3662,8 @@ MI = MRI.getVRegDef(Src); } + // TODO: Handle G_FSUB 0 as fneg + // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector. (void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard() @@ -3719,8 +3750,9 @@ Register Src; unsigned Mods; std::tie(Src, Mods) = selectVOP3ModsImpl(Root, - /* AllowAbs */ false, - /* OpSel */ false); + /*IsCanonicalizing=*/true, + /*AllowAbs=*/false, + /*OpSel=*/false); return {{ [=](MachineInstrBuilder &MIB) { @@ -3736,8 +3768,9 @@ Register Src; unsigned Mods; std::tie(Src, Mods) = selectVOP3ModsImpl(Root, - /* AllowAbs */ false, - /* OpSel */ true); + /*IsCanonicalizing=*/true, + /*AllowAbs=*/false, + /*OpSel=*/true); return {{ [=](MachineInstrBuilder &MIB) { Index: llvm/lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1340,7 +1340,16 @@ def MOVRELOffset : ComplexPattern; def VOP3Mods0 : ComplexPattern; + +// Modifiers for floating point instructions. def VOP3Mods : ComplexPattern; + +// VOP3 modifiers used for instructions that do not read canonicalized +// floating point values (i.e. integer operations with FP source +// modifiers) +def VOP3ModsNonCanonicalizing : ComplexPattern; + def VOP3NoMods : ComplexPattern; def VOP3OMods : ComplexPattern; Index: llvm/lib/Target/AMDGPU/SIInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/SIInstructions.td +++ llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1128,8 +1128,8 @@ >; class VOPSelectModsPat : GCNPat < - (vt (select i1:$src0, (VOP3Mods vt:$src1, i32:$src1_mods), - (VOP3Mods vt:$src2, i32:$src2_mods))), + (vt (select i1:$src0, (VOP3ModsNonCanonicalizing vt:$src1, i32:$src1_mods), + (VOP3ModsNonCanonicalizing vt:$src2, i32:$src2_mods))), (V_CNDMASK_B32_e64 FP32InputMods:$src2_mods, VSrc_b32:$src2, FP32InputMods:$src1_mods, VSrc_b32:$src1, SSrc_i1:$src0) >; Index: llvm/lib/Target/AMDGPU/VOPCInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -831,7 +831,7 @@ list ret = [(set i1:$sdst, (AMDGPUfp_class - (P.Src0VT (VOP3Mods P.Src0VT:$src0, i32:$src0_modifiers)), + (P.Src0VT (VOP3ModsNonCanonicalizing P.Src0VT:$src0, i32:$src0_modifiers)), i32:$src1))]; } Index: llvm/lib/Target/AMDGPU/VOPInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/VOPInstructions.td +++ llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -1487,7 +1487,7 @@ include "VOPDInstructions.td" class ClassPat : GCNPat < - (is_fpclass (vt (VOP3Mods vt:$src0, i32:$src0_mods)), (i32 timm:$mask)), + (is_fpclass (vt (VOP3ModsNonCanonicalizing vt:$src0, i32:$src0_mods)), (i32 timm:$mask)), (inst i32:$src0_mods, vt:$src0, (V_MOV_B32_e32 timm:$mask)) >; Index: llvm/test/CodeGen/AMDGPU/fneg-combines.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fneg-combines.ll +++ llvm/test/CodeGen/AMDGPU/fneg-combines.ll @@ -2608,10 +2608,9 @@ } ; This expects denormal flushing, so can't turn this fmul into fneg -; TODO: Keeping this as fmul saves encoding size ; GCN-LABEL: {{^}}nnan_fmul_neg1_to_fneg: -; GCN: v_sub_f32_e32 [[TMP:v[0-9]+]], 0x80000000, v0 -; GCN-NEXT: v_mul_f32_e32 v0, [[TMP]], v1 +; GCN: s_waitcnt +; GCN-NEXT: v_mul_f32_e64 v0, -v0, v1 define float @nnan_fmul_neg1_to_fneg(float %x, float %y) #0 { %mul = fmul float %x, -1.0 %add = fmul nnan float %mul, %y @@ -2631,8 +2630,9 @@ ; know the source can't be an snan ; GCN-LABEL: {{^}}denorm_snan_fmul_neg1_to_fneg: -; GCN: v_mul_f32_e64 [[TMP:v[0-9]+]], v0, -v0 -; GCN: v_mul_f32_e32 v0, [[TMP]], v1 +; GCN: s_waitcnt +; GCN-NEXT: v_mul_f32_e64 [[TMP:v[0-9]+]], v0, -v0 +; GCN-NEXT: v_mul_f32_e32 v0, [[TMP]], v1 ; GCN-NEXT: s_setpc_b64 define float @denorm_snan_fmul_neg1_to_fneg(float %x, float %y) { %canonical = fmul float %x, %x @@ -2642,9 +2642,9 @@ } ; GCN-LABEL: {{^}}flush_snan_fmul_neg1_to_fneg: -; GCN: v_mul_f32_e32 [[TMP0:v[0-9]+]], 1.0, v0 -; GCN: v_sub_f32_e32 [[TMP1:v[0-9]+]], 0x80000000, [[TMP0]] -; GCN-NEXT: v_mul_f32_e32 v0, [[TMP1]], v1 +; GCN: s_waitcnt +; GCN-NEXT: v_mul_f32_e32 [[TMP:v[0-9]+]], 1.0, v0 +; GCN-NEXT: v_mul_f32_e64 v0, -[[TMP]], v1 define float @flush_snan_fmul_neg1_to_fneg(float %x, float %y) #0 { %quiet = call float @llvm.canonicalize.f32(float %x) %mul = fmul float %quiet, -1.0 Index: llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll +++ llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll @@ -2641,8 +2641,7 @@ ; GCN-LABEL: nnan_fmul_neg1_to_fneg: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_sub_f32_e32 v0, 0x80000000, v0 -; GCN-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-NEXT: v_mul_f32_e64 v0, -v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] %mul = fmul float %x, -1.0 %add = fmul nnan float %mul, %y @@ -2681,8 +2680,7 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_sub_f32_e32 v0, 0x80000000, v0 -; GCN-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-NEXT: v_mul_f32_e64 v0, -v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] %quiet = call float @llvm.canonicalize.f32(float %x) %mul = fmul float %quiet, -1.0 Index: llvm/test/CodeGen/AMDGPU/fsub-as-fneg-src-modifier.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fsub-as-fneg-src-modifier.ll +++ llvm/test/CodeGen/AMDGPU/fsub-as-fneg-src-modifier.ll @@ -49,24 +49,36 @@ } define float @fold_f32_fsub_into_fneg_modifier_ieee_pos0(float %v0, float %v1) #0 { -; CHECK-LABEL: fold_f32_fsub_into_fneg_modifier_ieee_pos0: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_sub_f32_e32 v0, 0, v0 -; CHECK-NEXT: v_mul_f32_e32 v0, v0, v1 -; CHECK-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: fold_f32_fsub_into_fneg_modifier_ieee_pos0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_mul_f32_e64 v0, -v0, v1 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: fold_f32_fsub_into_fneg_modifier_ieee_pos0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_sub_f32_e32 v0, 0, v0 +; GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GISEL-NEXT: s_setpc_b64 s[30:31] %sub = fsub float 0.0, %v0 %mul = fmul float %sub, %v1 ret float %mul } define float @fold_f32_fsub_into_fneg_modifier_daz_pos0(float %v0, float %v1) #1 { -; CHECK-LABEL: fold_f32_fsub_into_fneg_modifier_daz_pos0: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_sub_f32_e32 v0, 0, v0 -; CHECK-NEXT: v_mul_f32_e32 v0, v0, v1 -; CHECK-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: fold_f32_fsub_into_fneg_modifier_daz_pos0: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_mul_f32_e64 v0, -v0, v1 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: fold_f32_fsub_into_fneg_modifier_daz_pos0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_sub_f32_e32 v0, 0, v0 +; GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GISEL-NEXT: s_setpc_b64 s[30:31] %sub = fsub float 0.0, %v0 %mul = fmul float %sub, %v1 ret float %mul @@ -113,8 +125,7 @@ ; SDAG-LABEL: fold_f32_fsub_into_fneg_modifier_daz: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_sub_f32_e32 v0, 0x80000000, v0 -; SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; SDAG-NEXT: v_mul_f32_e64 v0, -v0, v1 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: fold_f32_fsub_into_fneg_modifier_daz: @@ -150,8 +161,7 @@ ; SDAG-LABEL: fold_f32_fsub_into_fneg_modifier_daz_nsz: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_sub_f32_e32 v0, 0x80000000, v0 -; SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; SDAG-NEXT: v_mul_f32_e64 v0, -v0, v1 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: fold_f32_fsub_into_fneg_modifier_daz_nsz: @@ -169,8 +179,7 @@ ; SDAG-LABEL: fold_f32_fsub_into_fneg_modifier_dynamic: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_sub_f32_e32 v0, 0x80000000, v0 -; SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; SDAG-NEXT: v_mul_f32_e64 v0, -v0, v1 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: fold_f32_fsub_into_fneg_modifier_dynamic: @@ -188,8 +197,7 @@ ; SDAG-LABEL: fold_f32_fsub_into_fneg_modifier_dynamic_nsz: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_sub_f32_e32 v0, 0x80000000, v0 -; SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; SDAG-NEXT: v_mul_f32_e64 v0, -v0, v1 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: fold_f32_fsub_into_fneg_modifier_dynamic_nsz: @@ -228,10 +236,8 @@ ; SDAG-LABEL: fold_v2f32_fsub_into_fneg_modifier_daz: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_sub_f32_e32 v1, 0x80000000, v1 -; SDAG-NEXT: v_sub_f32_e32 v0, 0x80000000, v0 -; SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 -; SDAG-NEXT: v_mul_f32_e32 v1, v1, v3 +; SDAG-NEXT: v_mul_f32_e64 v0, -v0, v2 +; SDAG-NEXT: v_mul_f32_e64 v1, -v1, v3 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: fold_v2f32_fsub_into_fneg_modifier_daz: @@ -272,10 +278,8 @@ ; SDAG-LABEL: fold_v2f32_fsub_into_fneg_modifier_daz_nsz: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_sub_f32_e32 v1, 0x80000000, v1 -; SDAG-NEXT: v_sub_f32_e32 v0, 0x80000000, v0 -; SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 -; SDAG-NEXT: v_mul_f32_e32 v1, v1, v3 +; SDAG-NEXT: v_mul_f32_e64 v0, -v0, v2 +; SDAG-NEXT: v_mul_f32_e64 v1, -v1, v3 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: fold_v2f32_fsub_into_fneg_modifier_daz_nsz: @@ -295,10 +299,8 @@ ; SDAG-LABEL: fold_v2f32_fsub_into_fneg_modifier_dynamic: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_sub_f32_e32 v1, 0x80000000, v1 -; SDAG-NEXT: v_sub_f32_e32 v0, 0x80000000, v0 -; SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 -; SDAG-NEXT: v_mul_f32_e32 v1, v1, v3 +; SDAG-NEXT: v_mul_f32_e64 v0, -v0, v2 +; SDAG-NEXT: v_mul_f32_e64 v1, -v1, v3 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: fold_v2f32_fsub_into_fneg_modifier_dynamic: @@ -318,10 +320,8 @@ ; SDAG-LABEL: fold_v2f32_fsub_into_fneg_modifier_dynamic_nsz: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_sub_f32_e32 v1, 0x80000000, v1 -; SDAG-NEXT: v_sub_f32_e32 v0, 0x80000000, v0 -; SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 -; SDAG-NEXT: v_mul_f32_e32 v1, v1, v3 +; SDAG-NEXT: v_mul_f32_e64 v0, -v0, v2 +; SDAG-NEXT: v_mul_f32_e64 v1, -v1, v3 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: fold_v2f32_fsub_into_fneg_modifier_dynamic_nsz: @@ -360,8 +360,7 @@ ; SDAG-LABEL: fold_f16_fsub_into_fneg_modifier_daz: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_sub_f16_e32 v0, 0x8000, v0 -; SDAG-NEXT: v_mul_f16_e32 v0, v0, v1 +; SDAG-NEXT: v_mul_f16_e64 v0, -v0, v1 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: fold_f16_fsub_into_fneg_modifier_daz: @@ -397,8 +396,7 @@ ; SDAG-LABEL: fold_f16_fsub_into_fneg_modifier_daz_nsz: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_sub_f16_e32 v0, 0x8000, v0 -; SDAG-NEXT: v_mul_f16_e32 v0, v0, v1 +; SDAG-NEXT: v_mul_f16_e64 v0, -v0, v1 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: fold_f16_fsub_into_fneg_modifier_daz_nsz: @@ -416,8 +414,7 @@ ; SDAG-LABEL: fold_f16_fsub_into_fneg_modifier_dynamic: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_sub_f16_e32 v0, 0x8000, v0 -; SDAG-NEXT: v_mul_f16_e32 v0, v0, v1 +; SDAG-NEXT: v_mul_f16_e64 v0, -v0, v1 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: fold_f16_fsub_into_fneg_modifier_dynamic: @@ -435,8 +432,7 @@ ; SDAG-LABEL: fold_f16_fsub_into_fneg_modifier_dynamic_nsz: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_sub_f16_e32 v0, 0x8000, v0 -; SDAG-NEXT: v_mul_f16_e32 v0, v0, v1 +; SDAG-NEXT: v_mul_f16_e64 v0, -v0, v1 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: fold_f16_fsub_into_fneg_modifier_dynamic_nsz: @@ -1288,10 +1284,9 @@ ; SDAG-LABEL: fold_f16_fsub_into_fneg_modifier_interp_daz: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_sub_f32_e32 v0, 0x80000000, v0 ; SDAG-NEXT: s_mov_b32 m0, s4 ; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3 -; SDAG-NEXT: v_interp_p1ll_f16 v0, v0, attr2.y +; SDAG-NEXT: v_interp_p1ll_f16 v0, -v0, attr2.y ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: fold_f16_fsub_into_fneg_modifier_interp_daz: Index: llvm/test/CodeGen/AMDGPU/llvm.exp.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.exp.ll +++ llvm/test/CodeGen/AMDGPU/llvm.exp.ll @@ -5793,8 +5793,8 @@ ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: v_rndne_f32_e32 v0, 0 -; VI-SDAG-NEXT: v_sub_f32_e32 v1, 0, v0 -; VI-SDAG-NEXT: v_add_f32_e32 v1, 0x7fc00000, v1 +; VI-SDAG-NEXT: s_mov_b32 s4, 0x7fc00000 +; VI-SDAG-NEXT: v_add_f32_e64 v1, -v0, s4 ; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_ldexp_f32 v0, v1, v0