diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -73,6 +73,25 @@
     [{ return RegBankHelper.matchFPMinMaxToMed3(*${min_or_max}, ${matchinfo}); }]),
   (apply [{ RegBankHelper.applyMed3(*${min_or_max}, ${matchinfo}); }])>;
 
+// Fold fp min(max(Val, 0.0), 1.0) or max(min(Val, 1.0), 0.0) into a clamp of
+// Val. The NaN-safety conditions are checked in matchFPMinMaxToClamp.
+def fp_minmax_to_clamp : GICombineRule<
+  (defs root:$min_or_max, register_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_FMAXNUM,
+                           G_FMINNUM,
+                           G_FMAXNUM_IEEE,
+                           G_FMINNUM_IEEE):$min_or_max,
+    [{ return RegBankHelper.matchFPMinMaxToClamp(*${min_or_max}, ${matchinfo}); }]),
+  (apply [{ RegBankHelper.applyClamp(*${min_or_max}, ${matchinfo}); }])>;
+
+// Fold an amdgcn.fmed3 intrinsic whose constant operands are 0.0 and 1.0 into
+// a clamp. The NaN-safety conditions are checked in matchFPMed3ToClamp.
+def fmed3_intrinsic_to_clamp : GICombineRule<
+  (defs root:$fmed3, register_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_INTRINSIC):$fmed3,
+    [{ return RegBankHelper.matchFPMed3ToClamp(*${fmed3}, ${matchinfo}); }]),
+  (apply [{ RegBankHelper.applyClamp(*${fmed3}, ${matchinfo}); }])>;
+
 def remove_fcanonicalize_matchinfo : GIDefMatchData<"Register">;
 
 def remove_fcanonicalize : GICombineRule<
@@ -113,7 +132,7 @@
 def AMDGPURegBankCombinerHelper : GICombinerHelper<
   "AMDGPUGenRegBankCombinerHelper",
   [zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
-   fp_minmax_to_med3]> {
+   fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp]> {
   let DisableRuleOption = "amdgpuregbankcombiner-disable-rule";
   let StateClass = "AMDGPURegBankCombinerHelperState";
   let AdditionalArguments = [];
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -173,6 +173,7 @@
 def : GINodeEquiv;
 def : GINodeEquiv;
 def : GINodeEquiv;
+def : GINodeEquiv<G_AMDGPU_CLAMP, AMDGPUclamp>;
 
 def : GINodeEquiv;
 def : GINodeEquiv;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -24,6 +24,7 @@
 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/Target/TargetMachine.h"
 
 #define DEBUG_TYPE "amdgpu-regbank-combiner"
@@ -67,12 +68,18 @@
 
   bool matchIntMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo);
   bool matchFPMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo);
+  bool matchFPMinMaxToClamp(MachineInstr &MI, Register &Reg);
+  bool matchFPMed3ToClamp(MachineInstr &MI, Register &Reg);
   void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo);
+  void applyClamp(MachineInstr &MI, Register &Reg);
 
 private:
   AMDGPU::SIModeRegisterDefaults getMode();
   bool getIEEE();
+  bool getDX10Clamp();
   bool isFminnumIeee(const MachineInstr &MI);
+  bool isFCst(MachineInstr *MI);
+  bool isClampZeroToOne(MachineInstr *K0, MachineInstr *K1);
 };
 
 bool AMDGPURegBankCombinerHelper::isVgprRegBank(Register Reg) {
@@ -167,19 +174,20 @@
 // fmed3(NaN, K0, K1) = min(min(NaN, K0), K1)
 // ieee = true : min/max(SNaN, K) = QNaN, min/max(QNaN, K) = K
 // ieee = false : min/max(NaN, K) = K
+// clamp(NaN) = dx10_clamp ? 0.0 : NaN
 // Consider values of min(max(Val, K0), K1) and max(min(Val, K1), K0) as input.
 // Other operand commutes (see matchMed) give same result since min and max are
 // commutative.
 
 // Try to replace fp min(max(Val, K0), K1) or max(min(Val, K1), K0), KO<=K1
-// with fmed3(Val, K0, K1).
+// with fmed3(Val, K0, K1) or clamp(Val). Clamp requires K0 = 0.0 and K1 = 1.0.
 // Val = SNaN only for ieee = true
 // fmed3(SNaN, K0, K1) = min(min(SNaN, K0), K1) = min(QNaN, K1) = K1
 // min(max(SNaN, K0), K1) = min(QNaN, K1) = K1
 // max(min(SNaN, K1), K0) = max(K1, K0) = K1
 // Val = NaN,ieee = false or Val = QNaN,ieee = true
 // fmed3(NaN, K0, K1) = min(min(NaN, K0), K1) = min(K0, K1) = K0
-// min(max(NaN, K0), K1) = min(K0, K1) = K0
+// min(max(NaN, K0), K1) = min(K0, K1) = K0 (can clamp when dx10_clamp = true)
 // max(min(NaN, K1), K0) = max(K1, K0) = K1 != K0
 bool AMDGPURegBankCombinerHelper::matchFPMinMaxToMed3(
     MachineInstr &MI, Med3MatchInfo &MatchInfo) {
@@ -217,6 +225,99 @@
   return false;
 }
 
+/// Match min(max(Val, 0.0), 1.0) or max(min(Val, 1.0), 0.0) rooted at \p MI
+/// and, when folding to clamp is NaN-safe, return Val in \p Reg.
+/// \returns true if \p MI can be replaced with G_AMDGPU_CLAMP of \p Reg.
+bool AMDGPURegBankCombinerHelper::matchFPMinMaxToClamp(MachineInstr &MI,
+                                                       Register &Reg) {
+  // Clamp is available on all types after regbankselect (f16, f32, f64, v2f16).
+  auto OpcodeTriple = getMinMaxPair(MI.getOpcode());
+  Register Val;
+  Optional<FPValueAndVReg> K0, K1;
+  // Match min(max(Val, K0), K1) or max(min(Val, K1), K0).
+  if (!matchMed<GFCstOrSplatGFCstMatch>(MI, MRI, OpcodeTriple, Val, K0, K1))
+    return false;
+
+  if (!K0->Value.isExactlyValue(0.0) || !K1->Value.isExactlyValue(1.0))
+    return false;
+
+  // For IEEE=false perform combine only when it's safe to assume that there are
+  // no NaN inputs. Most often MI is marked with nnan fast math flag.
+  // For IEEE=true consider NaN inputs. Only min(max(QNaN, 0.0), 1.0) evaluates
+  // to 0.0, which equals clamp(QNaN) only when dx10_clamp = true.
+  if ((getIEEE() && getDX10Clamp() && isFminnumIeee(MI) &&
+       isKnownNeverSNaN(Val, MRI)) ||
+      isKnownNeverNaN(MI.getOperand(0).getReg(), MRI)) {
+    Reg = Val;
+    return true;
+  }
+
+  return false;
+}
+
+// Replacing fmed3(NaN, 0.0, 1.0) with clamp. Requires dx10_clamp = true.
+// Val = SNaN only for ieee = true. It is important which operand is NaN.
+// min(min(SNaN, 0.0), 1.0) = min(QNaN, 1.0) = 1.0
+// min(min(SNaN, 1.0), 0.0) = min(QNaN, 0.0) = 0.0
+// min(min(0.0, 1.0), SNaN) = min(0.0, SNaN) = QNaN
+// Val = NaN,ieee = false or Val = QNaN,ieee = true
+// min(min(NaN, 0.0), 1.0) = min(0.0, 1.0) = 0.0
+// min(min(NaN, 1.0), 0.0) = min(1.0, 0.0) = 0.0
+// min(min(0.0, 1.0), NaN) = min(0.0, NaN) = 0.0
+//
+// Matches an amdgcn.fmed3 intrinsic \p MI with constant operands 0.0 and 1.0
+// (in any operand order) and returns the remaining operand in \p Reg.
+bool AMDGPURegBankCombinerHelper::matchFPMed3ToClamp(MachineInstr &MI,
+                                                     Register &Reg) {
+  if (MI.getIntrinsicID() != Intrinsic::amdgcn_fmed3)
+    return false;
+
+  // In llvm-ir, clamp is often represented as an intrinsic call to
+  // @llvm.amdgcn.fmed3.f32(%Val, 0.0, 1.0). Check for other operand orders.
+  MachineInstr *Src0 = getDefIgnoringCopies(MI.getOperand(2).getReg(), MRI);
+  MachineInstr *Src1 = getDefIgnoringCopies(MI.getOperand(3).getReg(), MRI);
+  MachineInstr *Src2 = getDefIgnoringCopies(MI.getOperand(4).getReg(), MRI);
+  // Keep the def of the original third operand; the swaps below reorder Src*.
+  MachineInstr *const Op3 = Src2;
+
+  // Move the non-constant operand (if any) into Src0.
+  if (isFCst(Src0) && !isFCst(Src1))
+    std::swap(Src0, Src1);
+  if (isFCst(Src1) && !isFCst(Src2))
+    std::swap(Src1, Src2);
+  if (isFCst(Src0) && !isFCst(Src1))
+    std::swap(Src0, Src1);
+  if (!isClampZeroToOne(Src1, Src2))
+    return false;
+
+  Register Val = Src0->getOperand(0).getReg();
+
+  auto isOp3Zero = [&]() {
+    return isFCst(Op3) && Op3->getOperand(1).getFPImm()->isExactlyValue(0.0);
+  };
+  // For IEEE=false perform combine only when it's safe to assume that there are
+  // no NaN inputs. Most often MI is marked with nnan fast math flag.
+  // For IEEE=true consider NaN inputs. Requires dx10_clamp = true. Safe to fold
+  // when Val could be QNaN. If Val can also be SNaN third input should be 0.0.
+  if (isKnownNeverNaN(MI.getOperand(0).getReg(), MRI) ||
+      (getIEEE() && getDX10Clamp() &&
+       (isKnownNeverSNaN(Val, MRI) || isOp3Zero()))) {
+    Reg = Val;
+    return true;
+  }
+
+  return false;
+}
+
+/// Replace \p MI with G_AMDGPU_CLAMP of \p Reg, reusing MI's result operand
+/// and MI flags, then erase \p MI.
+void AMDGPURegBankCombinerHelper::applyClamp(MachineInstr &MI, Register &Reg) {
+  B.setInstrAndDebugLoc(MI);
+  B.buildInstr(AMDGPU::G_AMDGPU_CLAMP, {MI.getOperand(0)}, {Reg},
+               MI.getFlags());
+  MI.eraseFromParent();
+}
+
 void AMDGPURegBankCombinerHelper::applyMed3(MachineInstr &MI,
                                             Med3MatchInfo &MatchInfo) {
   B.setInstrAndDebugLoc(MI);
@@ -233,10 +334,31 @@
 
 bool AMDGPURegBankCombinerHelper::getIEEE() { return getMode().IEEE; }
 
+/// \returns the DX10Clamp field of getMode().
+bool AMDGPURegBankCombinerHelper::getDX10Clamp() { return getMode().DX10Clamp; }
+
 bool AMDGPURegBankCombinerHelper::isFminnumIeee(const MachineInstr &MI) {
   return MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE;
 }
 
+/// \returns true if \p MI is a G_FCONSTANT.
+bool AMDGPURegBankCombinerHelper::isFCst(MachineInstr *MI) {
+  return MI->getOpcode() == AMDGPU::G_FCONSTANT;
+}
+
+/// \returns true if \p K0 and \p K1 are both G_FCONSTANTs holding exactly 0.0
+/// and 1.0, in either order.
+bool AMDGPURegBankCombinerHelper::isClampZeroToOne(MachineInstr *K0,
+                                                   MachineInstr *K1) {
+  if (isFCst(K0) && isFCst(K1)) {
+    const ConstantFP *K0_FPImm = K0->getOperand(1).getFPImm();
+    const ConstantFP *K1_FPImm = K1->getOperand(1).getFPImm();
+    return (K0_FPImm->isExactlyValue(0.0) && K1_FPImm->isExactlyValue(1.0)) ||
+           (K0_FPImm->isExactlyValue(1.0) && K1_FPImm->isExactlyValue(0.0));
+  }
+  return false;
+}
+
 class AMDGPURegBankCombinerHelperState {
 protected:
   CombinerHelper &Helper;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2872,6 +2872,13 @@
   let hasSideEffects = 0;
 }
 
+// Generic clamp: one source operand and a result of the same type (type0).
+def G_AMDGPU_CLAMP : AMDGPUGenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src);
+  let hasSideEffects = 0;
+}
+
 // Atomic cmpxchg. $cmpval ad $newval are packed in a single vector
 // operand Expects a MachineMemOperand in addition to explicit
 // operands.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll @@ -0,0 +1,124 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s + +define float @test_fmed3_f32_known_nnan_ieee_true(float %a) #0 { +; GFX10-LABEL: test_fmed3_f32_known_nnan_ieee_true: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] + %fmul = fmul float %a, 2.0 + %fmed = call nnan float @llvm.amdgcn.fmed3.f32(float %fmul, float 0.0, float 1.0) + ret float %fmed +} + +define half @test_fmed3_f16_known_nnan_ieee_false(half %a) #1 { +; GFX10-LABEL: test_fmed3_f16_known_nnan_ieee_false: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f16_e64 v0, v0, 2.0 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] + %fmul = fmul half %a, 2.0 + %fmed = call nnan half @llvm.amdgcn.fmed3.f16(half %fmul, half 0.0, half 1.0) + ret half %fmed +} + +; %fmin is known non-SNaN because fmin inputs are fcanonicalized +define float @test_fmed3_non_SNaN_input_ieee_true_dx10clamp_true(float %a) #2 { +; GFX10-LABEL: test_fmed3_non_SNaN_input_ieee_true_dx10clamp_true: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f32_e64 v0, 0x41200000, v0 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] + %fmin = call float @llvm.minnum.f32(float %a, float 10.0) + %fmed = call float @llvm.amdgcn.fmed3.f32(float %fmin, float 0.0, float 1.0) + ret float %fmed +} + 
+; input may be SNaN. It's safe to clamp since third operand in fmed3 is 0.0 +define float @test_fmed3_maybe_SNaN_input_zero_third_operand_ieee_true_dx10clamp_true(float %a) #2 { +; GFX10-LABEL: test_fmed3_maybe_SNaN_input_zero_third_operand_ieee_true_dx10clamp_true: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] + %fmul = fmul float %a, 2.0 + %fmed = call float @llvm.amdgcn.fmed3.f32(float %fmul, float 1.0, float 0.0) + ret float %fmed +} + +; global nnan function attribute always forces clamp combine + +define float @test_fmed3_global_nnan(float %a) #3 { +; GFX10-LABEL: test_fmed3_global_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] + %fmul = fmul float %a, 2.0 + %fmed = call float @llvm.amdgcn.fmed3.f32(float %fmul, float 0.0, float 1.0) + ret float %fmed +} + +; ------------------------------------------------------------------------------ +; Negative patterns +; ------------------------------------------------------------------------------ + +; ieee=false requires known never NaN input +define float @test_fmed3_f32_maybe_NaN_ieee_false(float %a) #1 { +; GFX10-LABEL: test_fmed3_f32_maybe_NaN_ieee_false: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f32_e32 v0, 2.0, v0 +; GFX10-NEXT: v_med3_f32 v0, v0, 1.0, 0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %fmul = fmul float %a, 2.0 + %fmed = call float @llvm.amdgcn.fmed3.f32(float %fmul, float 1.0, float 0.0) + ret float %fmed +} + +; ieee=true input is known non-SNaN but dx10_clamp=false +define float @test_fmed3_non_SNaN_input_ieee_true_dx10clamp_false(float %a) #4 { +; GFX10-LABEL: 
test_fmed3_non_SNaN_input_ieee_true_dx10clamp_false: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f32_e32 v0, 0x41200000, v0 +; GFX10-NEXT: v_med3_f32 v0, v0, 0, 1.0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %fmin = call float @llvm.minnum.f32(float %a, float 10.0) + %fmed = call float @llvm.amdgcn.fmed3.f32(float %fmin, float 0.0, float 1.0) + ret float %fmed +} + +; ieee=true dx10_clamp=true but input may be SNaN, clamp requires third operand in fmed3 to be 0.0 +define float @test_fmed3_maybe_SNaN_input_ieee_true_dx10clamp_true(float %a) #2 { +; GFX10-LABEL: test_fmed3_maybe_SNaN_input_ieee_true_dx10clamp_true: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f32_e32 v0, 2.0, v0 +; GFX10-NEXT: v_med3_f32 v0, v0, 0, 1.0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %fmul = fmul float %a, 2.0 + %fmed = call float @llvm.amdgcn.fmed3.f32(float %fmul, float 0.0, float 1.0) + ret float %fmed +} + +declare half @llvm.amdgcn.fmed3.f16(half, half, half) +declare float @llvm.amdgcn.fmed3.f32(float, float, float) +declare float @llvm.minnum.f32(float, float) + +attributes #0 = {"amdgpu-ieee"="true"} +attributes #1 = {"amdgpu-ieee"="false"} +attributes #2 = {"amdgpu-ieee"="true" "amdgpu-dx10-clamp"="true"} +attributes #3 = {"no-nans-fp-math"="true"} +attributes #4 = {"amdgpu-ieee"="true" "amdgpu-dx10-clamp"="false"} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll @@ -0,0 +1,262 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck 
-check-prefix=GFX10 %s + +define float @test_min_max_ValK0_K1_f32(float %a) #0 { +; GFX10-LABEL: test_min_max_ValK0_K1_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] + %fmul = fmul float %a, 2.0 + %maxnum = call nnan float @llvm.maxnum.f32(float %fmul, float 0.0) + %fmed = call nnan float @llvm.minnum.f32(float %maxnum, float 1.0) + ret float %fmed +} + +define double @test_min_max_K0Val_K1_f64(double %a) #1 { +; GFX10-LABEL: test_min_max_K0Val_K1_f64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], 2.0 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] + %fmul = fmul double %a, 2.0 + %maxnum = call nnan double @llvm.maxnum.f64(double 0.0, double %fmul) + %fmed = call nnan double @llvm.minnum.f64(double %maxnum, double 1.0) + ret double %fmed +} + +; min-max patterns for ieee=true, dx10_clamp=true don't have to check for NaNs +define half @test_min_K1max_ValK0_f16(half %a) #2 { +; GFX10-LABEL: test_min_K1max_ValK0_f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f16_e64 v0, v0, 2.0 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] + %fmul = fmul half %a, 2.0 + %maxnum = call half @llvm.maxnum.f16(half %fmul, half 0.0) + %fmed = call half @llvm.minnum.f16(half 1.0, half %maxnum) + ret half %fmed +} + +define <2 x half> @test_min_K1max_K0Val_f16(<2 x half> %a) #1 { +; GFX10-LABEL: test_min_K1max_K0Val_f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_mul_f16 v0, v0, 2.0 op_sel_hi:[1,0] clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] + %fmul = fmul <2 x half> %a, + %maxnum = call nnan <2 x half> @llvm.maxnum.v2f16(<2 x half> , <2 x half> %fmul) + %fmed 
= call nnan <2 x half> @llvm.minnum.v2f16(<2 x half> , <2 x half> %maxnum) + ret <2 x half> %fmed +} + +define <2 x half> @test_min_max_splat_padded_with_undef(<2 x half> %a) #2 { +; GFX10-LABEL: test_min_max_splat_padded_with_undef: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_mul_f16 v0, v0, 2.0 op_sel_hi:[1,0] clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] + %fmul = fmul <2 x half> %a, + %maxnum = call <2 x half> @llvm.maxnum.v2f16(<2 x half> , <2 x half> %fmul) + %fmed = call <2 x half> @llvm.minnum.v2f16(<2 x half> , <2 x half> %maxnum) + ret <2 x half> %fmed +} + +; max-min patterns work only for known non-NaN inputs + +define float @test_max_min_ValK1_K0_f32(float %a) #0 { +; GFX10-LABEL: test_max_min_ValK1_K0_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] + %fmul = fmul float %a, 2.0 + %minnum = call nnan float @llvm.minnum.f32(float %fmul, float 1.0) + %fmed = call nnan float @llvm.maxnum.f32(float %minnum, float 0.0) + ret float %fmed +} + +define double @test_max_min_K1Val_K0_f64(double %a) #1 { +; GFX10-LABEL: test_max_min_K1Val_K0_f64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], 2.0 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] + %fmul = fmul double %a, 2.0 + %minnum = call nnan double @llvm.minnum.f64(double 1.0, double %fmul) + %fmed = call nnan double @llvm.maxnum.f64(double %minnum, double 0.0) + ret double %fmed +} + +define half @test_max_K0min_ValK1_f16(half %a) #0 { +; GFX10-LABEL: test_max_K0min_ValK1_f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f16_e64 v0, v0, 2.0 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] 
+ %fmul = fmul half %a, 2.0 + %minnum = call nnan half @llvm.minnum.f16(half %fmul, half 1.0) + %fmed = call nnan half @llvm.maxnum.f16(half 0.0, half %minnum) + ret half %fmed +} + +; treat undef as value that will result in a constant splat +define <2 x half> @test_max_K0min_K1Val_v2f16(<2 x half> %a) #1 { +; GFX10-LABEL: test_max_K0min_K1Val_v2f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_mul_f16 v0, v0, 2.0 op_sel_hi:[1,0] clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] + %fmul = fmul <2 x half> %a, + %minnum = call nnan <2 x half> @llvm.minnum.v2f16(<2 x half> , <2 x half> %fmul) + %fmed = call nnan <2 x half> @llvm.maxnum.v2f16(<2 x half> , <2 x half> %minnum) + ret <2 x half> %fmed +} + +; global nnan function attribute always forces clamp combine + +define float @test_min_max_global_nnan(float %a) #3 { +; GFX10-LABEL: test_min_max_global_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_max_f32_e64 v0, v0, v0 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] + %maxnum = call float @llvm.maxnum.f32(float %a, float 0.0) + %fmed = call float @llvm.minnum.f32(float %maxnum, float 1.0) + ret float %fmed +} + +define float @test_max_min_global_nnan(float %a) #3 { +; GFX10-LABEL: test_max_min_global_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_max_f32_e64 v0, v0, v0 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] + %minnum = call float @llvm.minnum.f32(float %a, float 1.0) + %fmed = call float @llvm.maxnum.f32(float %minnum, float 0.0) + ret float %fmed +} + +; ------------------------------------------------------------------------------ +; Negative patterns +; ------------------------------------------------------------------------------ + +; min(max(Val, 1.0), 0.0), should be min(max(Val, 0.0), 1.0) +define 
float @test_min_max_K0_gt_K1(float %a) #0 { +; GFX10-LABEL: test_min_max_K0_gt_K1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_max_f32_e32 v0, 1.0, v0 +; GFX10-NEXT: v_min_f32_e32 v0, 0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %maxnum = call nnan float @llvm.maxnum.f32(float %a, float 1.0) + %fmed = call nnan float @llvm.minnum.f32(float %maxnum, float 0.0) + ret float %fmed +} + +; max(min(Val, 0.0), 1.0), should be max(min(Val, 1.0), 0.0) +define float @test_max_min_K0_gt_K1(float %a) #0 { +; GFX10-LABEL: test_max_min_K0_gt_K1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_min_f32_e32 v0, 0, v0 +; GFX10-NEXT: v_max_f32_e32 v0, 1.0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %minnum = call nnan float @llvm.minnum.f32(float %a, float 0.0) + %fmed = call nnan float @llvm.maxnum.f32(float %minnum, float 1.0) + ret float %fmed +} + +; Input that can be NaN + +; min-max patterns for ieee=false require known non-NaN input +define float @test_min_max_maybe_NaN_input_ieee_false(float %a) #1 { +; GFX10-LABEL: test_min_max_maybe_NaN_input_ieee_false: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f32_e32 v0, 2.0, v0 +; GFX10-NEXT: v_max_f32_e32 v0, 0, v0 +; GFX10-NEXT: v_min_f32_e32 v0, 1.0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %fmul = fmul float %a, 2.0 + %maxnum = call float @llvm.maxnum.f32(float %fmul, float 0.0) + %fmed = call float @llvm.minnum.f32(float %maxnum, float 1.0) + ret float %fmed +} + +; clamp fails here since input can be NaN and dx10_clamp=false; fmed3 succeeds +define float @test_min_max_maybe_NaN_input_ieee_true_dx10clamp_false(float %a) #4 { +; GFX10-LABEL: test_min_max_maybe_NaN_input_ieee_true_dx10clamp_false: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f32_e32 v0, 2.0, v0 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_med3_f32 v0, v0, 0, 1.0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %fmul = fmul float %a, 2.0 + %maxnum = call float @llvm.maxnum.f32(float %fmul, float 0.0) + %fmed = call float @llvm.minnum.f32(float %maxnum, float 1.0) + ret float %fmed +} + +; max-min patterns always require known non-NaN input + +define float @test_max_min_maybe_NaN_input_ieee_true(float %a) #0 { +; GFX10-LABEL: test_max_min_maybe_NaN_input_ieee_true: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f32_e32 v0, 2.0, v0 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f32_e32 v0, 1.0, v0 +; GFX10-NEXT: v_max_f32_e32 v0, 0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %fmul = fmul float %a, 2.0 + %minnum = call float @llvm.minnum.f32(float %fmul, float 1.0) + %fmed = call float @llvm.maxnum.f32(float %minnum, float 0.0) + ret float %fmed +} + +define float @test_max_min_maybe_NaN_input_ieee_false(float %a) #1 { +; GFX10-LABEL: test_max_min_maybe_NaN_input_ieee_false: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f32_e32 v0, 2.0, v0 +; GFX10-NEXT: v_min_f32_e32 v0, 1.0, v0 +; GFX10-NEXT: v_max_f32_e32 v0, 0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %fmul = fmul float %a, 2.0 + %minnum = call float @llvm.minnum.f32(float %fmul, float 1.0) + %fmed = call float @llvm.maxnum.f32(float %minnum, float 0.0) + ret float %fmed +} + +declare half @llvm.minnum.f16(half, half) +declare half @llvm.maxnum.f16(half, half) +declare float @llvm.minnum.f32(float, float) +declare float @llvm.maxnum.f32(float, float) +declare double @llvm.minnum.f64(double, double) +declare double @llvm.maxnum.f64(double, double) +declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>) +declare <2 x half> 
@llvm.maxnum.v2f16(<2 x half>, <2 x half>) +attributes #0 = {"amdgpu-ieee"="true"} +attributes #1 = {"amdgpu-ieee"="false"} +attributes #2 = {"amdgpu-ieee"="true" "amdgpu-dx10-clamp"="true"} +attributes #3 = {"no-nans-fp-math"="true"} +attributes #4 = {"amdgpu-ieee"="true" "amdgpu-dx10-clamp"="false"} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-fmed3-const.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-fmed3-const.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-fmed3-const.mir @@ -0,0 +1,261 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -run-pass=amdgpu-regbank-combiner -verify-machineinstrs %s -o - | FileCheck %s +--- +name: test_fmed3_f32_known_nnan_ieee_true +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: + mode: + ieee: true + dx10-clamp: true +body: | + bb.1 : + liveins: $vgpr0 + + ; CHECK-LABEL: name: test_fmed3_f32_known_nnan_ieee_true + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[AMDGPU_CLAMP:%[0-9]+]]:vgpr(s32) = nnan G_AMDGPU_CLAMP [[FMUL]] + ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_CLAMP]](s32) + %0:vgpr(s32) = COPY $vgpr0 + %2:sgpr(s32) = G_FCONSTANT float 2.000000e+00 + %8:vgpr(s32) = COPY %2(s32) + %3:vgpr(s32) = G_FMUL %0, %8 + %6:sgpr(s32) = G_FCONSTANT float 1.000000e+00 + %5:sgpr(s32) = G_FCONSTANT float 0.000000e+00 + %9:vgpr(s32) = COPY %5(s32) + %10:vgpr(s32) = COPY %6(s32) + %4:vgpr(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.fmed3), %3(s32), %9(s32), %10(s32) + $vgpr0 = COPY %4(s32) +... 
+ +--- +name: test_fmed3_f16_known_nnan_ieee_false +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: + mode: + ieee: false + dx10-clamp: true +body: | + bb.1 : + liveins: $vgpr0 + + ; CHECK-LABEL: name: test_fmed3_f16_known_nnan_ieee_false + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s16) = G_FCONSTANT half 0xH4000 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s16) = COPY [[C]](s16) + ; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s16) = G_FMUL [[TRUNC]], [[COPY1]] + ; CHECK-NEXT: [[AMDGPU_CLAMP:%[0-9]+]]:vgpr(s16) = nnan G_AMDGPU_CLAMP [[FMUL]] + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:vgpr(s32) = G_ANYEXT [[AMDGPU_CLAMP]](s16) + ; CHECK-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + %2:vgpr(s32) = COPY $vgpr0 + %0:vgpr(s16) = G_TRUNC %2(s32) + %3:sgpr(s16) = G_FCONSTANT half 0xH4000 + %10:vgpr(s16) = COPY %3(s16) + %4:vgpr(s16) = G_FMUL %0, %10 + %7:sgpr(s16) = G_FCONSTANT half 0xH3C00 + %6:sgpr(s16) = G_FCONSTANT half 0xH0000 + %11:vgpr(s16) = COPY %6(s16) + %12:vgpr(s16) = COPY %7(s16) + %5:vgpr(s16) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.fmed3), %4(s16), %11(s16), %12(s16) + %9:vgpr(s32) = G_ANYEXT %5(s16) + $vgpr0 = COPY %9(s32) +... 
+ +--- +name: test_fmed3_non_SNaN_input_ieee_true_dx10clamp_true +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: + mode: + ieee: true + dx10-clamp: true +body: | + bb.1 : + liveins: $vgpr0 + + ; CHECK-LABEL: name: test_fmed3_non_SNaN_input_ieee_true_dx10clamp_true + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+01 + ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:vgpr(s32) = G_FCANONICALIZE [[COPY]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:vgpr(s32) = G_FMINNUM_IEEE [[FCANONICALIZE]], [[COPY1]] + ; CHECK-NEXT: [[AMDGPU_CLAMP:%[0-9]+]]:vgpr(s32) = G_AMDGPU_CLAMP [[FMINNUM_IEEE]] + ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_CLAMP]](s32) + %0:vgpr(s32) = COPY $vgpr0 + %2:sgpr(s32) = G_FCONSTANT float 1.000000e+01 + %8:vgpr(s32) = G_FCANONICALIZE %0 + %9:vgpr(s32) = COPY %2(s32) + %3:vgpr(s32) = G_FMINNUM_IEEE %8, %9 + %6:sgpr(s32) = G_FCONSTANT float 1.000000e+00 + %5:sgpr(s32) = G_FCONSTANT float 0.000000e+00 + %10:vgpr(s32) = COPY %5(s32) + %11:vgpr(s32) = COPY %6(s32) + %4:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmed3), %3(s32), %10(s32), %11(s32) + $vgpr0 = COPY %4(s32) +... 
+ +--- +name: test_fmed3_maybe_SNaN_input_zero_third_operand_ieee_true_dx10clamp_true +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: + mode: + ieee: true + dx10-clamp: true +body: | + bb.1 : + liveins: $vgpr0 + + ; CHECK-LABEL: name: test_fmed3_maybe_SNaN_input_zero_third_operand_ieee_true_dx10clamp_true + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[AMDGPU_CLAMP:%[0-9]+]]:vgpr(s32) = G_AMDGPU_CLAMP [[FMUL]] + ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_CLAMP]](s32) + %0:vgpr(s32) = COPY $vgpr0 + %2:sgpr(s32) = G_FCONSTANT float 2.000000e+00 + %8:vgpr(s32) = COPY %2(s32) + %3:vgpr(s32) = G_FMUL %0, %8 + %6:sgpr(s32) = G_FCONSTANT float 0.000000e+00 + %5:sgpr(s32) = G_FCONSTANT float 1.000000e+00 + %9:vgpr(s32) = COPY %5(s32) + %10:vgpr(s32) = COPY %6(s32) + %4:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmed3), %3(s32), %9(s32), %10(s32) + $vgpr0 = COPY %4(s32) +... 
+ +# FixMe: add tests with attributes #3 = {"no-nans-fp-math"="true"} + +--- +name: test_fmed3_f32_maybe_NaN_ieee_false +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: + mode: + ieee: false + dx10-clamp: true +body: | + bb.1 : + liveins: $vgpr0 + + ; CHECK-LABEL: name: test_fmed3_f32_maybe_NaN_ieee_false + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00 + ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) + ; CHECK-NEXT: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmed3), [[FMUL]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK-NEXT: $vgpr0 = COPY [[INT]](s32) + %0:vgpr(s32) = COPY $vgpr0 + %2:sgpr(s32) = G_FCONSTANT float 2.000000e+00 + %8:vgpr(s32) = COPY %2(s32) + %3:vgpr(s32) = G_FMUL %0, %8 + %6:sgpr(s32) = G_FCONSTANT float 0.000000e+00 + %5:sgpr(s32) = G_FCONSTANT float 1.000000e+00 + %9:vgpr(s32) = COPY %5(s32) + %10:vgpr(s32) = COPY %6(s32) + %4:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmed3), %3(s32), %9(s32), %10(s32) + $vgpr0 = COPY %4(s32) +... 
+
+---
+name: test_fmed3_non_SNaN_input_ieee_true_dx10clamp_false
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  mode:
+    ieee: true
+    dx10-clamp: false
+body: |
+  bb.1 :
+    liveins: $vgpr0
+    ; fmed3(fminnum_ieee(fcanonicalize(Val), 10.0), 0.0, 1.0) with dx10-clamp disabled: the intrinsic is expected to be left as is.
+    ; CHECK-LABEL: name: test_fmed3_non_SNaN_input_ieee_true_dx10clamp_false
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+01
+    ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:vgpr(s32) = G_FCANONICALIZE [[COPY]]
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+    ; CHECK-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:vgpr(s32) = G_FMINNUM_IEEE [[FCANONICALIZE]], [[COPY1]]
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32)
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
+    ; CHECK-NEXT: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmed3), [[FMINNUM_IEEE]](s32), [[COPY2]](s32), [[COPY3]](s32)
+    ; CHECK-NEXT: $vgpr0 = COPY [[INT]](s32)
+    %0:vgpr(s32) = COPY $vgpr0
+    %2:sgpr(s32) = G_FCONSTANT float 1.000000e+01
+    %8:vgpr(s32) = G_FCANONICALIZE %0
+    %9:vgpr(s32) = COPY %2(s32)
+    %3:vgpr(s32) = G_FMINNUM_IEEE %8, %9
+    %6:sgpr(s32) = G_FCONSTANT float 1.000000e+00
+    %5:sgpr(s32) = G_FCONSTANT float 0.000000e+00
+    %10:vgpr(s32) = COPY %5(s32)
+    %11:vgpr(s32) = COPY %6(s32)
+    %4:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmed3), %3(s32), %10(s32), %11(s32)
+    $vgpr0 = COPY %4(s32)
+...
+
+---
+name: test_fmed3_maybe_SNaN_input_ieee_true_dx10clamp_true
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  mode:
+    ieee: true
+    dx10-clamp: true
+body: |
+  bb.1 :
+    liveins: $vgpr0
+    ; fmed3(Val, 0.0, 1.0) with ieee and dx10-clamp enabled, Val being a G_FMUL without nnan: the intrinsic is expected to be left as is.
+    ; CHECK-LABEL: name: test_fmed3_maybe_SNaN_input_ieee_true_dx10clamp_true
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+    ; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32)
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
+    ; CHECK-NEXT: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmed3), [[FMUL]](s32), [[COPY2]](s32), [[COPY3]](s32)
+    ; CHECK-NEXT: $vgpr0 = COPY [[INT]](s32)
+    %0:vgpr(s32) = COPY $vgpr0
+    %2:sgpr(s32) = G_FCONSTANT float 2.000000e+00
+    %8:vgpr(s32) = COPY %2(s32)
+    %3:vgpr(s32) = G_FMUL %0, %8
+    %6:sgpr(s32) = G_FCONSTANT float 1.000000e+00
+    %5:sgpr(s32) = G_FCONSTANT float 0.000000e+00
+    %9:vgpr(s32) = COPY %5(s32)
+    %10:vgpr(s32) = COPY %6(s32)
+    %4:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmed3), %3(s32), %9(s32), %10(s32)
+    $vgpr0 = COPY %4(s32)
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-minmax-const.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-minmax-const.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-minmax-const.mir
@@ -0,0 +1,583 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -run-pass=amdgpu-regbank-combiner -verify-machineinstrs %s -o - | FileCheck %s
+
+---
+name: test_min_max_ValK0_K1_f32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  mode:
+    ieee: true
+    dx10-clamp: true
+body: |
+  bb.1 :
+    liveins: $vgpr0
+    ; nnan fminnum_ieee(fmaxnum_ieee(Val, 0.0), 1.0) on s32: the expected output is a nnan G_AMDGPU_CLAMP(Val).
+    ; CHECK-LABEL: name: test_min_max_ValK0_K1_f32
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+    ; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: [[AMDGPU_CLAMP:%[0-9]+]]:vgpr(s32) = nnan G_AMDGPU_CLAMP [[FMUL]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_CLAMP]](s32)
+    %0:vgpr(s32) = COPY $vgpr0
+    %2:sgpr(s32) = G_FCONSTANT float 2.000000e+00
+    %9:vgpr(s32) = COPY %2(s32)
+    %3:vgpr(s32) = G_FMUL %0, %9
+    %4:sgpr(s32) = G_FCONSTANT float 0.000000e+00
+    %10:vgpr(s32) = COPY %4(s32)
+    %5:vgpr(s32) = nnan G_FMAXNUM_IEEE %3, %10
+    %6:sgpr(s32) = G_FCONSTANT float 1.000000e+00
+    %11:vgpr(s32) = COPY %6(s32)
+    %7:vgpr(s32) = nnan G_FMINNUM_IEEE %5, %11
+    $vgpr0 = COPY %7(s32)
+...
+
+---
+name: test_min_max_K0Val_K1_f64
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  mode:
+    ieee: false
+    dx10-clamp: true
+body: |
+  bb.1 :
+    liveins: $vgpr0_vgpr1
+    ; nnan fminnum(fmaxnum(0.0, Val), 1.0) on s64 with ieee disabled: the expected output is a nnan G_AMDGPU_CLAMP(Val).
+    ; CHECK-LABEL: name: test_min_max_K0Val_K1_f64
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_FCONSTANT double 2.000000e+00
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY [[C]](s64)
+    ; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s64) = G_FMUL [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: [[AMDGPU_CLAMP:%[0-9]+]]:vgpr(s64) = nnan G_AMDGPU_CLAMP [[FMUL]]
+    ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[AMDGPU_CLAMP]](s64)
+    %0:vgpr(s64) = COPY $vgpr0_vgpr1
+    %4:sgpr(s64) = G_FCONSTANT double 2.000000e+00
+    %13:vgpr(s64) = COPY %4(s64)
+    %5:vgpr(s64) = G_FMUL %0, %13
+    %6:sgpr(s64) = G_FCONSTANT double 0.000000e+00
+    %14:vgpr(s64) = COPY %6(s64)
+    %7:vgpr(s64) = nnan G_FMAXNUM %14, %5
+    %8:sgpr(s64) = G_FCONSTANT double 1.000000e+00
+    %15:vgpr(s64) = COPY %8(s64)
+    %9:vgpr(s64) = nnan G_FMINNUM %7, %15
+    $vgpr0_vgpr1 = COPY %9(s64)
+...
+
+---
+name: test_min_K1max_ValK0_f16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  mode:
+    ieee: true
+    dx10-clamp: true
+body: |
+  bb.1 :
+    liveins: $vgpr0
+    ; fminnum_ieee(1.0, fmaxnum_ieee(fcanonicalize(Val), 0.0)) on s16 without nnan: the expected output is G_AMDGPU_CLAMP of the canonicalized value.
+    ; CHECK-LABEL: name: test_min_K1max_ValK0_f16
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s16) = G_FCONSTANT half 0xH4000
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s16) = COPY [[C]](s16)
+    ; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s16) = G_FMUL [[TRUNC]], [[COPY1]]
+    ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:vgpr(s16) = G_FCANONICALIZE [[FMUL]]
+    ; CHECK-NEXT: [[AMDGPU_CLAMP:%[0-9]+]]:vgpr(s16) = G_AMDGPU_CLAMP [[FCANONICALIZE]]
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:vgpr(s32) = G_ANYEXT [[AMDGPU_CLAMP]](s16)
+    ; CHECK-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+    %2:vgpr(s32) = COPY $vgpr0
+    %0:vgpr(s16) = G_TRUNC %2(s32)
+    %3:sgpr(s16) = G_FCONSTANT half 0xH4000
+    %12:vgpr(s16) = COPY %3(s16)
+    %4:vgpr(s16) = G_FMUL %0, %12
+    %5:sgpr(s16) = G_FCONSTANT half 0xH0000
+    %11:vgpr(s16) = G_FCANONICALIZE %4
+    %13:vgpr(s16) = COPY %5(s16)
+    %6:vgpr(s16) = G_FMAXNUM_IEEE %11, %13
+    %7:sgpr(s16) = G_FCONSTANT half 0xH3C00
+    %14:vgpr(s16) = COPY %7(s16)
+    %8:vgpr(s16) = G_FMINNUM_IEEE %14, %6
+    %10:vgpr(s32) = G_ANYEXT %8(s16)
+    $vgpr0 = COPY %10(s32)
+...
+
+---
+name: test_min_K1max_K0Val_f16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  mode:
+    ieee: false
+    dx10-clamp: true
+body: |
+  bb.1 :
+    liveins: $vgpr0
+    ; nnan fminnum(splat 1.0, fmaxnum(splat 0.0, Val)) on <2 x s16> (the name says f16, the operands are vectors): the expected output is a nnan G_AMDGPU_CLAMP(Val).
+    ; CHECK-LABEL: name: test_min_K1max_K0Val_f16
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s16) = G_FCONSTANT half 0xH4000
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[C]](s16)
+    ; CHECK-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT]](s32)
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+    ; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(<2 x s16>) = G_FMUL [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: [[AMDGPU_CLAMP:%[0-9]+]]:vgpr(<2 x s16>) = nnan G_AMDGPU_CLAMP [[FMUL]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_CLAMP]](<2 x s16>)
+    %0:vgpr(<2 x s16>) = COPY $vgpr0
+    %3:sgpr(s16) = G_FCONSTANT half 0xH4000
+    %12:sgpr(s32) = G_ANYEXT %3(s16)
+    %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %12(s32), %12(s32)
+    %6:sgpr(s16) = G_FCONSTANT half 0xH0000
+    %13:sgpr(s32) = G_ANYEXT %6(s16)
+    %5:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %13(s32), %13(s32)
+    %9:sgpr(s16) = G_FCONSTANT half 0xH3C00
+    %14:sgpr(s32) = G_ANYEXT %9(s16)
+    %8:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %14(s32), %14(s32)
+    %15:vgpr(<2 x s16>) = COPY %2(<2 x s16>)
+    %4:vgpr(<2 x s16>) = G_FMUL %0, %15
+    %16:vgpr(<2 x s16>) = COPY %5(<2 x s16>)
+    %7:vgpr(<2 x s16>) = nnan G_FMAXNUM %16, %4
+    %17:vgpr(<2 x s16>) = COPY %8(<2 x s16>)
+    %10:vgpr(<2 x s16>) = nnan G_FMINNUM %17, %7
+    $vgpr0 = COPY %10(<2 x s16>)
+...
+
+---
+name: test_min_max_splat_padded_with_undef
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  mode:
+    ieee: true
+    dx10-clamp: true
+body: |
+  bb.1 :
+    liveins: $vgpr0
+    ; fminnum_ieee({1.0, undef}, fmaxnum_ieee({0.0, undef}, fcanonicalize(Val))) on <2 x s16>: the expected output is G_AMDGPU_CLAMP of the canonicalized value.
+    ; CHECK-LABEL: name: test_min_max_splat_padded_with_undef
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s16) = G_FCONSTANT half 0xH4000
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[C]](s16)
+    ; CHECK-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT]](s32)
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+    ; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(<2 x s16>) = G_FMUL [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:vgpr(<2 x s16>) = G_FCANONICALIZE [[FMUL]]
+    ; CHECK-NEXT: [[AMDGPU_CLAMP:%[0-9]+]]:vgpr(<2 x s16>) = G_AMDGPU_CLAMP [[FCANONICALIZE]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_CLAMP]](<2 x s16>)
+    %0:vgpr(<2 x s16>) = COPY $vgpr0
+    %3:sgpr(s16) = G_FCONSTANT half 0xH4000
+    %17:sgpr(s32) = G_ANYEXT %3(s16)
+    %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %17(s32), %17(s32)
+    %6:sgpr(s16) = G_FCONSTANT half 0xH0000
+    %18:sgpr(s32) = G_ANYEXT %6(s16)
+    %19:sgpr(s32) = G_IMPLICIT_DEF
+    %5:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %18(s32), %19(s32)
+    %10:sgpr(s16) = G_FCONSTANT half 0xH3C00
+    %20:sgpr(s32) = G_ANYEXT %10(s16)
+    %9:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %20(s32), %19(s32)
+    %21:vgpr(<2 x s16>) = COPY %2(<2 x s16>)
+    %4:vgpr(<2 x s16>) = G_FMUL %0, %21
+    %16:vgpr(<2 x s16>) = G_FCANONICALIZE %4
+    %22:vgpr(<2 x s16>) = COPY %5(<2 x s16>)
+    %8:vgpr(<2 x s16>) = G_FMAXNUM_IEEE %22, %16
+    %23:vgpr(<2 x s16>) = COPY %9(<2 x s16>)
+    %11:vgpr(<2 x s16>) = G_FMINNUM_IEEE %23, %8
+    $vgpr0 = COPY %11(<2 x s16>)
+...
+
+---
+name: test_max_min_ValK1_K0_f32
+machineFunctionInfo:
+  mode:
+    ieee: true
+    dx10-clamp: true
+body: |
+  bb.1 :
+    liveins: $vgpr0
+    ; nnan fmaxnum_ieee(fminnum_ieee(Val, 1.0), 0.0) on s32: expects a nnan G_AMDGPU_CLAMP(Val). NOTE(review): legalized/regBankSelected/tracksRegLiveness are not set here, so no liveins line is asserted.
+    ; CHECK-LABEL: name: test_max_min_ValK1_K0_f32
+    ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+    ; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: [[AMDGPU_CLAMP:%[0-9]+]]:vgpr(s32) = nnan G_AMDGPU_CLAMP [[FMUL]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_CLAMP]](s32)
+    %0:vgpr(s32) = COPY $vgpr0
+    %2:sgpr(s32) = G_FCONSTANT float 2.000000e+00
+    %9:vgpr(s32) = COPY %2(s32)
+    %3:vgpr(s32) = G_FMUL %0, %9
+    %4:sgpr(s32) = G_FCONSTANT float 1.000000e+00
+    %10:vgpr(s32) = COPY %4(s32)
+    %5:vgpr(s32) = nnan G_FMINNUM_IEEE %3, %10
+    %6:sgpr(s32) = G_FCONSTANT float 0.000000e+00
+    %11:vgpr(s32) = COPY %6(s32)
+    %7:vgpr(s32) = nnan G_FMAXNUM_IEEE %5, %11
+    $vgpr0 = COPY %7(s32)
+...
+
+---
+name: test_max_min_K1Val_K0_f64
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  mode:
+    ieee: false
+    dx10-clamp: true
+body: |
+  bb.1 :
+    liveins: $vgpr0_vgpr1
+    ; nnan fmaxnum(fminnum(1.0, Val), 0.0) on s64 with ieee disabled: the expected output is a nnan G_AMDGPU_CLAMP(Val).
+    ; CHECK-LABEL: name: test_max_min_K1Val_K0_f64
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_FCONSTANT double 2.000000e+00
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY [[C]](s64)
+    ; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s64) = G_FMUL [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: [[AMDGPU_CLAMP:%[0-9]+]]:vgpr(s64) = nnan G_AMDGPU_CLAMP [[FMUL]]
+    ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[AMDGPU_CLAMP]](s64)
+    %0:vgpr(s64) = COPY $vgpr0_vgpr1
+    %4:sgpr(s64) = G_FCONSTANT double 2.000000e+00
+    %13:vgpr(s64) = COPY %4(s64)
+    %5:vgpr(s64) = G_FMUL %0, %13
+    %6:sgpr(s64) = G_FCONSTANT double 1.000000e+00
+    %14:vgpr(s64) = COPY %6(s64)
+    %7:vgpr(s64) = nnan G_FMINNUM %14, %5
+    %8:sgpr(s64) = G_FCONSTANT double 0.000000e+00
+    %15:vgpr(s64) = COPY %8(s64)
+    %9:vgpr(s64) = nnan G_FMAXNUM %7, %15
+    $vgpr0_vgpr1 = COPY %9(s64)
+...
+
+---
+name: test_max_K0min_ValK1_f16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  mode:
+    ieee: true
+    dx10-clamp: true
+body: |
+  bb.1 :
+    liveins: $vgpr0
+    ; nnan fmaxnum_ieee(0.0, fminnum_ieee(Val, 1.0)) on s16: the expected output is a nnan G_AMDGPU_CLAMP(Val).
+    ; CHECK-LABEL: name: test_max_K0min_ValK1_f16
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s16) = G_FCONSTANT half 0xH4000
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s16) = COPY [[C]](s16)
+    ; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s16) = G_FMUL [[TRUNC]], [[COPY1]]
+    ; CHECK-NEXT: [[AMDGPU_CLAMP:%[0-9]+]]:vgpr(s16) = nnan G_AMDGPU_CLAMP [[FMUL]]
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:vgpr(s32) = G_ANYEXT [[AMDGPU_CLAMP]](s16)
+    ; CHECK-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+    %2:vgpr(s32) = COPY $vgpr0
+    %0:vgpr(s16) = G_TRUNC %2(s32)
+    %3:sgpr(s16) = G_FCONSTANT half 0xH4000
+    %11:vgpr(s16) = COPY %3(s16)
+    %4:vgpr(s16) = G_FMUL %0, %11
+    %5:sgpr(s16) = G_FCONSTANT half 0xH3C00
+    %12:vgpr(s16) = COPY %5(s16)
+    %6:vgpr(s16) = nnan G_FMINNUM_IEEE %4, %12
+    %7:sgpr(s16) = G_FCONSTANT half 0xH0000
+    %13:vgpr(s16) = COPY %7(s16)
+    %8:vgpr(s16) = nnan G_FMAXNUM_IEEE %13, %6
+    %10:vgpr(s32) = G_ANYEXT %8(s16)
+    $vgpr0 = COPY %10(s32)
+...
+
+---
+name: test_max_K0min_K1Val_v2f16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  mode:
+    ieee: false
+    dx10-clamp: true
+body: |
+  bb.1 :
+    liveins: $vgpr0
+    ; nnan fmaxnum({undef, 0.0}, fminnum({1.0, undef}, Val)) on <2 x s16> with ieee disabled: the expected output is a nnan G_AMDGPU_CLAMP(Val).
+    ; CHECK-LABEL: name: test_max_K0min_K1Val_v2f16
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s16) = G_FCONSTANT half 0xH4000
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[C]](s16)
+    ; CHECK-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT]](s32)
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+    ; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(<2 x s16>) = G_FMUL [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: [[AMDGPU_CLAMP:%[0-9]+]]:vgpr(<2 x s16>) = nnan G_AMDGPU_CLAMP [[FMUL]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_CLAMP]](<2 x s16>)
+    %0:vgpr(<2 x s16>) = COPY $vgpr0
+    %3:sgpr(s16) = G_FCONSTANT half 0xH4000
+    %13:sgpr(s32) = G_ANYEXT %3(s16)
+    %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %13(s32), %13(s32)
+    %6:sgpr(s16) = G_FCONSTANT half 0xH3C00
+    %14:sgpr(s32) = G_ANYEXT %6(s16)
+    %15:sgpr(s32) = G_IMPLICIT_DEF
+    %5:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %14(s32), %15(s32)
+    %10:sgpr(s16) = G_FCONSTANT half 0xH0000
+    %16:sgpr(s32) = G_ANYEXT %10(s16)
+    %9:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %15(s32), %16(s32)
+    %17:vgpr(<2 x s16>) = COPY %2(<2 x s16>)
+    %4:vgpr(<2 x s16>) = G_FMUL %0, %17
+    %18:vgpr(<2 x s16>) = COPY %5(<2 x s16>)
+    %8:vgpr(<2 x s16>) = nnan G_FMINNUM %18, %4
+    %19:vgpr(<2 x s16>) = COPY %9(<2 x s16>)
+    %11:vgpr(<2 x s16>) = nnan G_FMAXNUM %19, %8
+    $vgpr0 = COPY %11(<2 x s16>)
+...
+
+# FIXME: add tests for functions with attributes #3 = {"no-nans-fp-math"="true"}
+
+---
+name: test_min_max_K0_gt_K1
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  mode:
+    ieee: true
+    dx10-clamp: true
+body: |
+  bb.1 :
+    liveins: $vgpr0
+    ; nnan fminnum_ieee(fmaxnum_ieee(Val, 1.0), 0.0), i.e. the max constant is greater than the min constant: the sequence is expected to be left as is.
+    ; CHECK-LABEL: name: test_min_max_K0_gt_K1
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+    ; CHECK-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:vgpr(s32) = nnan G_FMAXNUM_IEEE [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
+    ; CHECK-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:vgpr(s32) = nnan G_FMINNUM_IEEE [[FMAXNUM_IEEE]], [[COPY2]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[FMINNUM_IEEE]](s32)
+    %0:vgpr(s32) = COPY $vgpr0
+    %2:sgpr(s32) = G_FCONSTANT float 1.000000e+00
+    %7:vgpr(s32) = COPY %2(s32)
+    %3:vgpr(s32) = nnan G_FMAXNUM_IEEE %0, %7
+    %4:sgpr(s32) = G_FCONSTANT float 0.000000e+00
+    %8:vgpr(s32) = COPY %4(s32)
+    %5:vgpr(s32) = nnan G_FMINNUM_IEEE %3, %8
+    $vgpr0 = COPY %5(s32)
+...
+
+---
+name: test_max_min_K0_gt_K1
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  mode:
+    ieee: true
+    dx10-clamp: true
+body: |
+  bb.1 :
+    liveins: $vgpr0
+    ; nnan fmaxnum_ieee(fminnum_ieee(Val, 0.0), 1.0), i.e. the max constant is greater than the min constant: the sequence is expected to be left as is.
+    ; CHECK-LABEL: name: test_max_min_K0_gt_K1
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+    ; CHECK-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:vgpr(s32) = nnan G_FMINNUM_IEEE [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
+    ; CHECK-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:vgpr(s32) = nnan G_FMAXNUM_IEEE [[FMINNUM_IEEE]], [[COPY2]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[FMAXNUM_IEEE]](s32)
+    %0:vgpr(s32) = COPY $vgpr0
+    %2:sgpr(s32) = G_FCONSTANT float 0.000000e+00
+    %7:vgpr(s32) = COPY %2(s32)
+    %3:vgpr(s32) = nnan G_FMINNUM_IEEE %0, %7
+    %4:sgpr(s32) = G_FCONSTANT float 1.000000e+00
+    %8:vgpr(s32) = COPY %4(s32)
+    %5:vgpr(s32) = nnan G_FMAXNUM_IEEE %3, %8
+    $vgpr0 = COPY %5(s32)
+...
+
+---
+name: test_min_max_maybe_NaN_input_ieee_false
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  mode:
+    ieee: false
+    dx10-clamp: true
+body: |
+  bb.1 :
+    liveins: $vgpr0
+    ; fminnum(fmaxnum(Val, 0.0), 1.0) without nnan and with ieee disabled: the sequence is expected to be left as is.
+    ; CHECK-LABEL: name: test_min_max_maybe_NaN_input_ieee_false
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+    ; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
+    ; CHECK-NEXT: [[FMAXNUM:%[0-9]+]]:vgpr(s32) = G_FMAXNUM [[FMUL]], [[COPY2]]
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32)
+    ; CHECK-NEXT: [[FMINNUM:%[0-9]+]]:vgpr(s32) = G_FMINNUM [[FMAXNUM]], [[COPY3]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[FMINNUM]](s32)
+    %0:vgpr(s32) = COPY $vgpr0
+    %2:sgpr(s32) = G_FCONSTANT float 2.000000e+00
+    %9:vgpr(s32) = COPY %2(s32)
+    %3:vgpr(s32) = G_FMUL %0, %9
+    %4:sgpr(s32) = G_FCONSTANT float 0.000000e+00
+    %10:vgpr(s32) = COPY %4(s32)
+    %5:vgpr(s32) = G_FMAXNUM %3, %10
+    %6:sgpr(s32) = G_FCONSTANT float 1.000000e+00
+    %11:vgpr(s32) = COPY %6(s32)
+    %7:vgpr(s32) = G_FMINNUM %5, %11
+    $vgpr0 = COPY %7(s32)
+...
+
+---
+name: test_min_max_maybe_NaN_input_ieee_true_dx10clamp_false
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  mode:
+    ieee: true
+    dx10-clamp: false
+body: |
+  bb.1 :
+    liveins: $vgpr0
+    ; fminnum_ieee(fmaxnum_ieee(fcanonicalize(Val), 0.0), 1.0) without nnan and with dx10-clamp disabled: the expected output is G_AMDGPU_FMED3, not a clamp.
+    ; CHECK-LABEL: name: test_min_max_maybe_NaN_input_ieee_true_dx10clamp_false
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+    ; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00
+    ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:vgpr(s32) = G_FCANONICALIZE [[FMUL]]
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32)
+    ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FMED3 [[FCANONICALIZE]], [[COPY2]], [[COPY3]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32)
+    %0:vgpr(s32) = COPY $vgpr0
+    %2:sgpr(s32) = G_FCONSTANT float 2.000000e+00
+    %10:vgpr(s32) = COPY %2(s32)
+    %3:vgpr(s32) = G_FMUL %0, %10
+    %4:sgpr(s32) = G_FCONSTANT float 0.000000e+00
+    %9:vgpr(s32) = G_FCANONICALIZE %3
+    %11:vgpr(s32) = COPY %4(s32)
+    %5:vgpr(s32) = G_FMAXNUM_IEEE %9, %11
+    %6:sgpr(s32) = G_FCONSTANT float 1.000000e+00
+    %12:vgpr(s32) = COPY %6(s32)
+    %7:vgpr(s32) = G_FMINNUM_IEEE %5, %12
+    $vgpr0 = COPY %7(s32)
+...
+
+---
+name: test_max_min_maybe_NaN_input_ieee_true
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  mode:
+    ieee: true
+    dx10-clamp: true
+body: |
+  bb.1 :
+    liveins: $vgpr0
+    ; fmaxnum_ieee(fminnum_ieee(fcanonicalize(Val), 1.0), 0.0) without nnan, ieee enabled: the sequence is expected to be left as is.
+    ; CHECK-LABEL: name: test_max_min_maybe_NaN_input_ieee_true
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+    ; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00
+    ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:vgpr(s32) = G_FCANONICALIZE [[FMUL]]
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
+    ; CHECK-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:vgpr(s32) = G_FMINNUM_IEEE [[FCANONICALIZE]], [[COPY2]]
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32)
+    ; CHECK-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:vgpr(s32) = G_FMAXNUM_IEEE [[FMINNUM_IEEE]], [[COPY3]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[FMAXNUM_IEEE]](s32)
+    %0:vgpr(s32) = COPY $vgpr0
+    %2:sgpr(s32) = G_FCONSTANT float 2.000000e+00
+    %10:vgpr(s32) = COPY %2(s32)
+    %3:vgpr(s32) = G_FMUL %0, %10
+    %4:sgpr(s32) = G_FCONSTANT float 1.000000e+00
+    %9:vgpr(s32) = G_FCANONICALIZE %3
+    %11:vgpr(s32) = COPY %4(s32)
+    %5:vgpr(s32) = G_FMINNUM_IEEE %9, %11
+    %6:sgpr(s32) = G_FCONSTANT float 0.000000e+00
+    %12:vgpr(s32) = COPY %6(s32)
+    %7:vgpr(s32) = G_FMAXNUM_IEEE %5, %12
+    $vgpr0 = COPY %7(s32)
+...
+
+---
+name: test_max_min_maybe_NaN_input_ieee_false
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  mode:
+    ieee: false
+    dx10-clamp: true
+body: |
+  bb.1 :
+    liveins: $vgpr0
+    ; fmaxnum(fminnum(Val, 1.0), 0.0) without nnan and with ieee disabled: the sequence is expected to be left as is.
+    ; CHECK-LABEL: name: test_max_min_maybe_NaN_input_ieee_false
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+    ; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
+    ; CHECK-NEXT: [[FMINNUM:%[0-9]+]]:vgpr(s32) = G_FMINNUM [[FMUL]], [[COPY2]]
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32)
+    ; CHECK-NEXT: [[FMAXNUM:%[0-9]+]]:vgpr(s32) = G_FMAXNUM [[FMINNUM]], [[COPY3]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[FMAXNUM]](s32)
+    %0:vgpr(s32) = COPY $vgpr0
+    %2:sgpr(s32) = G_FCONSTANT float 2.000000e+00
+    %9:vgpr(s32) = COPY %2(s32)
+    %3:vgpr(s32) = G_FMUL %0, %9
+    %4:sgpr(s32) = G_FCONSTANT float 1.000000e+00
+    %10:vgpr(s32) = COPY %4(s32)
+    %5:vgpr(s32) = G_FMINNUM %3, %10
+    %6:sgpr(s32) = G_FCONSTANT float 0.000000e+00
+    %11:vgpr(s32) = COPY %6(s32)
+    %7:vgpr(s32) = G_FMAXNUM %5, %11
+    $vgpr0 = COPY %7(s32)
+...