Index: llvm/lib/Target/AMDGPU/AMDGPUCombine.td =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -43,7 +43,9 @@ (match (wip_match_opcode G_SMAX, G_SMIN, G_UMAX, - G_UMIN):$min_or_max, + G_UMIN, + G_FMINNUM, + G_FMAXNUM):$min_or_max, [{ return matchMinMaxToMed3(*${min_or_max}, MRI, ${matchinfo}); }]), (apply [{ applyMinMaxToMed3(*${min_or_max}, ${matchinfo}); }])>; Index: llvm/lib/Target/AMDGPU/AMDGPUGISel.td =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -212,6 +212,7 @@ def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; +def : GINodeEquiv; class GISelSop2Pat < SDPatternOperator node, Index: llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp @@ -23,6 +23,7 @@ #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/Support/Debug.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIMachineFunctionInfo.h" #define DEBUG_TYPE "amdgpu-postlegalizer-combiner" @@ -39,7 +40,7 @@ /// Returns true and stores \p MI in \p Cst if it represents constant. 
bool isConst(MIPtr MI, MachineRegisterInfo &MRI, MIPtr &Cst) { unsigned Opc = MI->getOpcode(); - if (Opc == AMDGPU::G_CONSTANT) { + if (Opc == AMDGPU::G_CONSTANT || Opc == AMDGPU::G_FCONSTANT) { Cst = MI; return true; } @@ -98,6 +99,9 @@ case AMDGPU::G_UMAX: case AMDGPU::G_UMIN: return {AMDGPU::G_UMIN, AMDGPU::G_UMAX, AMDGPU::G_AMDGPU_UMED3}; + case AMDGPU::G_FMINNUM: + case AMDGPU::G_FMAXNUM: + return {AMDGPU::G_FMINNUM, AMDGPU::G_FMAXNUM, AMDGPU::G_AMDGPU_FMED3}; } } @@ -158,6 +162,36 @@ return true; } + // Float clamp: both bounds are read as FP immediates; give up when K0 >= K1. + if (OpcodeTriple.Med == AMDGPU::G_AMDGPU_FMED3) { + const APFloat &K0_FPImm = K0->getOperand(1).getFPImm()->getValue(); + const APFloat &K1_FPImm = K1->getOperand(1).getFPImm()->getValue(); + if (K0_FPImm >= K1_FPImm) + return false; + + const MachineFunction *MF = MI.getMF(); + const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); + (void)Info; // Only read by the assert below; avoids an unused warning. + + // TODO: Add G_FMINNUM_IEEE (requires some additional checks for possible + // SNaN input). + if (OpcodeTriple.Min == AMDGPU::G_FMINNUM) { + assert(!Info->getMode().IEEE && "G_FMINNUM was supposed to be lowered to " + "G_FMINNUM_IEEE with IEEE=true"); + + const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo(); + APInt K0bits = K0_FPImm.bitcastToAPInt(); + APInt K1bits = K1_FPImm.bitcastToAPInt(); + // Fold only if each constant is an inline constant or is not single-use + // (counting non-debug uses). + if ((!MRI.hasOneNonDBGUse(K0Def) || TII->isInlineConstant(K0bits)) && + (!MRI.hasOneNonDBGUse(K1Def) || TII->isInlineConstant(K1bits))) { + MatchInfo = {OpcodeTriple.Med, ValDef, K0Def, K1Def}; + return true; + } + } + } + return false; } Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3642,6 +3642,7 @@ case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3: case AMDGPU::G_AMDGPU_SMED3: case AMDGPU::G_AMDGPU_UMED3: + case AMDGPU::G_AMDGPU_FMED3: return getDefaultMappingVOP(MI); case AMDGPU::G_UMULH: case AMDGPU::G_SMULH: { Index: 
llvm/lib/Target/AMDGPU/SIInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/SIInstructions.td +++ llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2693,3 +2693,9 @@ let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2); let hasSideEffects = 0; } + +def G_AMDGPU_FMED3 : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2); + let hasSideEffects = 0; +} Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll @@ -643,6 +643,98 @@ ret void } +; Constant clamp patterns built from llvm.maxnum/llvm.minnum on f32. +define amdgpu_ps float @test_min_max_ValK0_K1_f32(float %a) { +; GCN-LABEL: test_min_max_ValK0_K1_f32: +; GCN: ; %bb.0: +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: ; return to shader part epilog + %maxnum = call float @llvm.maxnum.f32(float %a, float 2.0) + %fmed = call float @llvm.minnum.f32(float %maxnum, float 4.0) + ret float %fmed +} + +define amdgpu_ps float @test_non_inline_const(float %a) { +; GCN-LABEL: test_non_inline_const: +; GCN: ; %bb.0: +; GCN-NEXT: v_max_f32_e32 v0, 2.0, v0 +; GCN-NEXT: v_min_f32_e32 v0, 0x41000000, v0 +; GCN-NEXT: ; return to shader part epilog + %maxnum = call float @llvm.maxnum.f32(float %a, float 2.0) + %fmed = call float @llvm.minnum.f32(float %maxnum, float 8.0) + ret float %fmed +} + +define amdgpu_ps float @test_min_max_K0Val_K1_f32(float %a) { +; GCN-LABEL: test_min_max_K0Val_K1_f32: +; GCN: ; %bb.0: +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: ; return to shader part epilog + %maxnum = call float @llvm.maxnum.f32(float 2.0, float %a) + %fmed = call float @llvm.minnum.f32(float %maxnum, float 4.0) + ret float %fmed +} + +define amdgpu_ps float @test_min_K1max_ValK0__f32(float %a) { +; GCN-LABEL: test_min_K1max_ValK0__f32: +; GCN: ; %bb.0: +; GCN-NEXT: v_med3_f32 v0, v0, 
2.0, 4.0 +; GCN-NEXT: ; return to shader part epilog + %maxnum = call float @llvm.maxnum.f32(float %a, float 2.0) + %fmed = call float @llvm.minnum.f32(float 4.0, float %maxnum) + ret float %fmed +} + +define amdgpu_ps float @test_min_K1max_K0Val__f32(float %a) { +; GCN-LABEL: test_min_K1max_K0Val__f32: +; GCN: ; %bb.0: +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: ; return to shader part epilog + %maxnum = call float @llvm.maxnum.f32(float 2.0, float %a) + %fmed = call float @llvm.minnum.f32(float 4.0, float %maxnum) + ret float %fmed +} + +define amdgpu_ps float @test_max_min_ValK1_K0_f32(float %a) { +; GCN-LABEL: test_max_min_ValK1_K0_f32: +; GCN: ; %bb.0: +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: ; return to shader part epilog + %minnum = call float @llvm.minnum.f32(float %a, float 4.0) + %fmed = call float @llvm.maxnum.f32(float %minnum, float 2.0) + ret float %fmed +} + +define amdgpu_ps float @test_max_min_K1Val_K0_f32(float %a) { +; GCN-LABEL: test_max_min_K1Val_K0_f32: +; GCN: ; %bb.0: +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: ; return to shader part epilog + %minnum = call float @llvm.minnum.f32(float 4.0, float %a) + %fmed = call float @llvm.maxnum.f32(float %minnum, float 2.0) + ret float %fmed +} + +define amdgpu_ps float @test_max_K0min_ValK1__f32(float %a) { +; GCN-LABEL: test_max_K0min_ValK1__f32: +; GCN: ; %bb.0: +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: ; return to shader part epilog + %minnum = call float @llvm.minnum.f32(float %a, float 4.0) + %fmed = call float @llvm.maxnum.f32(float 2.0, float %minnum) + ret float %fmed +} + +define amdgpu_ps float @test_max_K0min_K1Val__f32(float %a) { +; GCN-LABEL: test_max_K0min_K1Val__f32: +; GCN: ; %bb.0: +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: ; return to shader part epilog + %minnum = call float @llvm.minnum.f32(float 4.0, float %a) + %fmed = call float @llvm.maxnum.f32(float 2.0, float %minnum) + ret float %fmed +} + declare i32 
@llvm.amdgcn.workitem.id.x() #0 declare float @llvm.fabs.f32(float) #0 declare float @llvm.minnum.f32(float, float) #0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-fmed3.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-fmed3.mir @@ -0,0 +1,256 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s +# Checks folding of G_FMINNUM/G_FMAXNUM constant clamps into G_AMDGPU_FMED3 (ieee: false). + +--- +name: test_min_max_ValK0_K1_f32 +legalized: true +tracksRegLiveness: true +machineFunctionInfo: + mode: + ieee: false +body: | + bb.1: + liveins: $vgpr0 + + ; CHECK-LABEL: name: test_min_max_ValK0_K1_f32 + ; CHECK: liveins: $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 2.000000e+00 + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 4.000000e+00 + ; CHECK: [[AMDGPU_FMED3_:%[0-9]+]]:_(s32) = G_AMDGPU_FMED3 [[COPY]], [[C]], [[C1]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + %0:_(s32) = COPY $vgpr0 + %3:_(s32) = G_FCONSTANT float 2.000000e+00 + %4:_(s32) = G_FMAXNUM %0, %3 + %5:_(s32) = G_FCONSTANT float 4.000000e+00 + %6:_(s32) = G_FMINNUM %4, %5 + $vgpr0 = COPY %6(s32) + SI_RETURN_TO_EPILOG implicit $vgpr0 +... 
+ +--- +name: test_non_inline_const +legalized: true +tracksRegLiveness: true +machineFunctionInfo: + mode: + ieee: false +body: | + bb.1: + liveins: $vgpr0 + + ; CHECK-LABEL: name: test_non_inline_const + ; CHECK: liveins: $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 2.000000e+00 + ; CHECK: [[FMAXNUM:%[0-9]+]]:_(s32) = G_FMAXNUM [[COPY]], [[C]] + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 8.000000e+00 + ; CHECK: [[FMINNUM:%[0-9]+]]:_(s32) = G_FMINNUM [[FMAXNUM]], [[C1]] + ; CHECK: $vgpr0 = COPY [[FMINNUM]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + %0:_(s32) = COPY $vgpr0 + %3:_(s32) = G_FCONSTANT float 2.000000e+00 + %4:_(s32) = G_FMAXNUM %0, %3 + %5:_(s32) = G_FCONSTANT float 8.000000e+00 + %6:_(s32) = G_FMINNUM %4, %5 + $vgpr0 = COPY %6(s32) + SI_RETURN_TO_EPILOG implicit $vgpr0 +... + +--- +name: test_min_max_K0Val_K1_f32 +legalized: true +tracksRegLiveness: true +machineFunctionInfo: + mode: + ieee: false +body: | + bb.1: + liveins: $vgpr0 + + ; CHECK-LABEL: name: test_min_max_K0Val_K1_f32 + ; CHECK: liveins: $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 2.000000e+00 + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 4.000000e+00 + ; CHECK: [[AMDGPU_FMED3_:%[0-9]+]]:_(s32) = G_AMDGPU_FMED3 [[COPY]], [[C]], [[C1]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + %0:_(s32) = COPY $vgpr0 + %3:_(s32) = G_FCONSTANT float 2.000000e+00 + %4:_(s32) = G_FMAXNUM %3, %0 + %5:_(s32) = G_FCONSTANT float 4.000000e+00 + %6:_(s32) = G_FMINNUM %4, %5 + $vgpr0 = COPY %6(s32) + SI_RETURN_TO_EPILOG implicit $vgpr0 +... 
+ +--- +name: test_min_K1max_ValK0__f32 +legalized: true +tracksRegLiveness: true +machineFunctionInfo: + mode: + ieee: false +body: | + bb.1: + liveins: $vgpr0 + + ; CHECK-LABEL: name: test_min_K1max_ValK0__f32 + ; CHECK: liveins: $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 2.000000e+00 + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 4.000000e+00 + ; CHECK: [[AMDGPU_FMED3_:%[0-9]+]]:_(s32) = G_AMDGPU_FMED3 [[COPY]], [[C]], [[C1]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + %0:_(s32) = COPY $vgpr0 + %3:_(s32) = G_FCONSTANT float 2.000000e+00 + %4:_(s32) = G_FMAXNUM %0, %3 + %5:_(s32) = G_FCONSTANT float 4.000000e+00 + %6:_(s32) = G_FMINNUM %5, %4 + $vgpr0 = COPY %6(s32) + SI_RETURN_TO_EPILOG implicit $vgpr0 +... + +--- +name: test_min_K1max_K0Val__f32 +legalized: true +tracksRegLiveness: true +machineFunctionInfo: + mode: + ieee: false +body: | + bb.1: + liveins: $vgpr0 + + ; CHECK-LABEL: name: test_min_K1max_K0Val__f32 + ; CHECK: liveins: $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 2.000000e+00 + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 4.000000e+00 + ; CHECK: [[AMDGPU_FMED3_:%[0-9]+]]:_(s32) = G_AMDGPU_FMED3 [[COPY]], [[C]], [[C1]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + %0:_(s32) = COPY $vgpr0 + %3:_(s32) = G_FCONSTANT float 2.000000e+00 + %4:_(s32) = G_FMAXNUM %3, %0 + %5:_(s32) = G_FCONSTANT float 4.000000e+00 + %6:_(s32) = G_FMINNUM %5, %4 + $vgpr0 = COPY %6(s32) + SI_RETURN_TO_EPILOG implicit $vgpr0 +... 
+ +--- +name: test_max_min_ValK1_K0_f32 +legalized: true +tracksRegLiveness: true +machineFunctionInfo: + mode: + ieee: false +body: | + bb.1: + liveins: $vgpr0 + + ; CHECK-LABEL: name: test_max_min_ValK1_K0_f32 + ; CHECK: liveins: $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 4.000000e+00 + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 2.000000e+00 + ; CHECK: [[AMDGPU_FMED3_:%[0-9]+]]:_(s32) = G_AMDGPU_FMED3 [[COPY]], [[C1]], [[C]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + %0:_(s32) = COPY $vgpr0 + %3:_(s32) = G_FCONSTANT float 4.000000e+00 + %4:_(s32) = G_FMINNUM %0, %3 + %5:_(s32) = G_FCONSTANT float 2.000000e+00 + %6:_(s32) = G_FMAXNUM %4, %5 + $vgpr0 = COPY %6(s32) + SI_RETURN_TO_EPILOG implicit $vgpr0 +... + +--- +name: test_max_min_K1Val_K0_f32 +legalized: true +tracksRegLiveness: true +machineFunctionInfo: + mode: + ieee: false +body: | + bb.1: + liveins: $vgpr0 + + ; CHECK-LABEL: name: test_max_min_K1Val_K0_f32 + ; CHECK: liveins: $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 4.000000e+00 + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 2.000000e+00 + ; CHECK: [[AMDGPU_FMED3_:%[0-9]+]]:_(s32) = G_AMDGPU_FMED3 [[COPY]], [[C1]], [[C]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + %0:_(s32) = COPY $vgpr0 + %3:_(s32) = G_FCONSTANT float 4.000000e+00 + %4:_(s32) = G_FMINNUM %3, %0 + %5:_(s32) = G_FCONSTANT float 2.000000e+00 + %6:_(s32) = G_FMAXNUM %4, %5 + $vgpr0 = COPY %6(s32) + SI_RETURN_TO_EPILOG implicit $vgpr0 +... 
+ +--- +name: test_max_K0min_ValK1__f32 +legalized: true +tracksRegLiveness: true +machineFunctionInfo: + mode: + ieee: false +body: | + bb.1: + liveins: $vgpr0 + + ; CHECK-LABEL: name: test_max_K0min_ValK1__f32 + ; CHECK: liveins: $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 4.000000e+00 + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 2.000000e+00 + ; CHECK: [[AMDGPU_FMED3_:%[0-9]+]]:_(s32) = G_AMDGPU_FMED3 [[COPY]], [[C1]], [[C]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + %0:_(s32) = COPY $vgpr0 + %3:_(s32) = G_FCONSTANT float 4.000000e+00 + %4:_(s32) = G_FMINNUM %0, %3 + %5:_(s32) = G_FCONSTANT float 2.000000e+00 + %6:_(s32) = G_FMAXNUM %5, %4 + $vgpr0 = COPY %6(s32) + SI_RETURN_TO_EPILOG implicit $vgpr0 +... + +--- +name: test_max_K0min_K1Val__f32 +legalized: true +tracksRegLiveness: true +machineFunctionInfo: + mode: + ieee: false +body: | + bb.1: + liveins: $vgpr0 + + ; CHECK-LABEL: name: test_max_K0min_K1Val__f32 + ; CHECK: liveins: $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 4.000000e+00 + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 2.000000e+00 + ; CHECK: [[AMDGPU_FMED3_:%[0-9]+]]:_(s32) = G_AMDGPU_FMED3 [[COPY]], [[C1]], [[C]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + %0:_(s32) = COPY $vgpr0 + %3:_(s32) = G_FCONSTANT float 4.000000e+00 + %4:_(s32) = G_FMINNUM %3, %0 + %5:_(s32) = G_FCONSTANT float 2.000000e+00 + %6:_(s32) = G_FMAXNUM %5, %4 + $vgpr0 = COPY %6(s32) + SI_RETURN_TO_EPILOG implicit $vgpr0 +...