diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -35,10 +35,6 @@ GIComplexOperandMatcher, GIComplexPatternEquiv; -def gi_vop3mods_nnan : - GIComplexOperandMatcher, - GIComplexPatternEquiv; - def gi_vop3omods : GIComplexOperandMatcher, GIComplexPatternEquiv; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -212,7 +212,6 @@ SDValue &Offset) const; bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const; - bool SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods, bool AllowAbs = true) const; bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -2629,12 +2629,6 @@ return false; } -bool AMDGPUDAGToDAGISel::SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, - SDValue &SrcMods) const { - SelectVOP3Mods(In, Src, SrcMods); - return CurDAG->isKnownNeverNaN(Src); -} - bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const { if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG) return false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -173,9 +173,6 @@ ComplexRendererFns selectVOP3NoMods(MachineOperand &Root) const; - InstructionSelector::ComplexRendererFns - selectVOP3Mods_nnan(MachineOperand &Root) const; - std::pair selectVOP3PModsImpl(Register Src, const MachineRegisterInfo &MRI, bool IsDOT = false) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -3721,22 +3721,6 @@ }}; } -InstructionSelector::ComplexRendererFns -AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const { - Register Src; - unsigned Mods; - std::tie(Src, Mods) = selectVOP3ModsImpl(Root); - if (!isKnownNeverNaN(Src, *MRI)) - return std::nullopt; - - return {{ - [=](MachineInstrBuilder &MIB) { - MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB)); - }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods - }}; -} - InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const { Register Src; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -809,6 +809,25 @@ (fmaxnum node:$src0, node:$src1)] >; +class NeverNaNPats frags> : PatFrags { + let PredicateCode = [{ + return CurDAG->isKnownNeverNaN(SDValue(N,0)); + }]; + let GISelPredicateCode = [{ + return isKnownNeverNaN(MI.getOperand(0).getReg(), MRI); + }]; +} + +def fminnum_like_nnan : NeverNaNPats<(ops node:$src0, node:$src1), + [(fminnum_ieee node:$src0, node:$src1), + (fminnum node:$src0, node:$src1)] +>; + +def fmaxnum_like_nnan : NeverNaNPats<(ops node:$src0, node:$src1), + [(fmaxnum_ieee node:$src0, node:$src1), + (fmaxnum node:$src0, node:$src1)] +>; + def fminnum_like_oneuse : PatFrags<(ops node:$src0, node:$src1), [(fminnum_ieee_oneuse node:$src0, node:$src1), (fminnum_oneuse node:$src0, node:$src1)] diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1488,8 +1488,6 @@ def VOP3Mods0 : ComplexPattern; def VOP3Mods : ComplexPattern; def VOP3NoMods : ComplexPattern; -// VOP3Mods, but the input source is known to never be NaN. -def VOP3Mods_nnan : ComplexPattern; def VOP3OMods : ComplexPattern; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -3228,26 +3228,25 @@ defm : IntMed3Pat; defm : IntMed3Pat; -// This matches 16 permutations of -// max(min(x, y), min(max(x, y), z)) +// This matches 16 permutations of max(min(x, y), min(max(x, y), z)) class FPMed3Pat : GCNPat< - (fmaxnum_like (fminnum_like (VOP3Mods_nnan vt:$src0, i32:$src0_mods), - (VOP3Mods_nnan vt:$src1, i32:$src1_mods)), - (fminnum_like (fmaxnum_like (VOP3Mods_nnan vt:$src0, i32:$src0_mods), - (VOP3Mods_nnan vt:$src1, i32:$src1_mods)), - (vt (VOP3Mods_nnan vt:$src2, i32:$src2_mods)))), - (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE, DSTOMOD.NONE) ->; + (fmaxnum_like_nnan + (fminnum_like (VOP3Mods vt:$src0, i32:$src0_mods), + (VOP3Mods vt:$src1, i32:$src1_mods)), + (fminnum_like (fmaxnum_like (VOP3Mods vt:$src0, i32:$src0_mods), + (VOP3Mods vt:$src1, i32:$src1_mods)), + (vt (VOP3Mods vt:$src2, i32:$src2_mods)))), + (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, + DSTCLAMP.NONE, DSTOMOD.NONE)>; class FP16Med3Pat : GCNPat< - (fmaxnum_like (fminnum_like (VOP3Mods_nnan vt:$src0, i32:$src0_mods), - (VOP3Mods_nnan vt:$src1, i32:$src1_mods)), - (fminnum_like (fmaxnum_like (VOP3Mods_nnan vt:$src0, i32:$src0_mods), - (VOP3Mods_nnan vt:$src1, i32:$src1_mods)), - (vt (VOP3Mods_nnan vt:$src2, i32:$src2_mods)))), + (fmaxnum_like_nnan (fminnum_like (VOP3Mods vt:$src0, i32:$src0_mods), + (VOP3Mods vt:$src1, i32:$src1_mods)), + (fminnum_like (fmaxnum_like (VOP3Mods vt:$src0, i32:$src0_mods), + (VOP3Mods vt:$src1, i32:$src1_mods)), + (vt (VOP3Mods vt:$src2, i32:$src2_mods)))), (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE) >; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll @@ -427,10 +427,7 @@ ; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, -1.0, v2 -; SI-NEXT: v_min_f32_e32 v5, v2, v3 -; SI-NEXT: v_max_f32_e32 v2, v2, v3 -; SI-NEXT: v_min_f32_e32 v2, v2, v4 -; SI-NEXT: v_max_f32_e32 v2, v5, v2 +; SI-NEXT: v_med3_f32 v2, v2, v3, v4 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm @@ -450,23 +447,20 @@ ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_mov_b32_e32 v4, s6 ; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: flat_load_dword v7, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_dword v2, v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, v4, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] glc +; VI-NEXT: flat_load_dword v3, v[4:5] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mul_f32_e32 v4, -1.0, v7 -; VI-NEXT: v_min_f32_e32 v5, v4, v2 -; VI-NEXT: v_max_f32_e32 v2, v4, v2 -; VI-NEXT: v_min_f32_e32 v2, v2, v3 -; VI-NEXT: v_max_f32_e32 v2, v5, v2 +; VI-NEXT: v_med3_f32 v2, v4, v2, v3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -482,10 +476,7 @@ ; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e64 v1, -v1, -v1 -; GFX9-NEXT: v_min_f32_e32 v4, v1, v2 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_max_f32_e32 v1, v4, v1 +; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -501,10 +492,7 @@ ; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_max_f32_e64 v1, -v1, -v1 -; GFX10-NEXT: v_max_f32_e32 v4, v1, v2 -; GFX10-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX10-NEXT: v_min_f32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX10-NEXT: v_med3_f32 v1, v1, v2, v3 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -520,10 +508,8 @@ ; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f32_e64 v1, -v1, -v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f32_e32 v4, v1, v2 -; GFX11-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX11-NEXT: v_minmax_f32 v1, v1, v3, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll --- a/llvm/test/CodeGen/AMDGPU/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll @@ -314,6 +314,73 @@ ret void } + +; GCN-LABEL: {{^}}v_nnan_input_calls_med3_f32_pat0: +; GCN: {{buffer_|flat_|global_}}load_dword [[A:v[0-9]+]] +; GCN: {{buffer_|flat_|global_}}load_dword [[B:v[0-9]+]] +; GCN: {{buffer_|flat_|global_}}load_dword [[C:v[0-9]+]] +; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] +define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load volatile float, float addrspace(1)* %gep0 + %b = load volatile float, float addrspace(1)* %gep1 + %c = load volatile float, float addrspace(1)* %gep2 + %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b) + %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b) + %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c) + %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) + store float %med3, float addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_nnan_call_med3_f32_pat0: +; GCN: {{buffer_|flat_|global_}}load_dword [[A:v[0-9]+]] +; GCN: {{buffer_|flat_|global_}}load_dword [[B:v[0-9]+]] +; GCN: {{buffer_|flat_|global_}}load_dword [[C:v[0-9]+]] +; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] +define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load volatile float, float addrspace(1)* %gep0 + %b = load volatile float, float addrspace(1)* %gep1 + %c = load volatile float, float addrspace(1)* %gep2 + %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b) + %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b) + %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2) + store float %med3, float addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_fast_call_med3_f32_pat0: +; GCN: {{buffer_|flat_|global_}}load_dword [[A:v[0-9]+]] +; GCN: {{buffer_|flat_|global_}}load_dword [[B:v[0-9]+]] +; GCN: {{buffer_|flat_|global_}}load_dword [[C:v[0-9]+]] +; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] +define amdgpu_kernel void @v_fast_call_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load volatile float, float addrspace(1)* %gep0 + %b = load volatile float, float addrspace(1)* %gep1 + %c = load volatile float, float addrspace(1)* %gep2 + %tmp0 = call fast float @llvm.minnum.f32(float %a, float %b) + %tmp1 = call fast float @llvm.maxnum.f32(float %a, float %b) + %tmp2 = call fast float @llvm.minnum.f32(float %tmp1, float %c) + %med3 = call fast float @llvm.maxnum.f32(float %tmp0, float %tmp2) + store float %med3, float addrspace(1)* %outgep + ret void +} + ; 16 combinations ; 0: max(min(x, y), min(max(x, y), z))