Index: llvm/lib/Target/AMDGPU/AMDGPUCombine.td =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -37,17 +37,24 @@ [{ return matchCvtF32UByteN(*${cvt_f32_ubyteN}, MRI, *MF, ${matchinfo}); }]), (apply [{ applyCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }])>; -def minmax_to_med3_matchdata : GIDefMatchData<"MinMaxToMed3MatchInfo">; -def minmax_to_med3 : GICombineRule< - (defs root:$min_or_max, minmax_to_med3_matchdata:$matchinfo), +def minmax_to_med3_or_clamp_matchdata : GIDefMatchData<"MinMaxToMed3OrClampMatchInfo">; +def minmax_to_med3_or_clamp : GICombineRule< + (defs root:$min_or_max, minmax_to_med3_or_clamp_matchdata:$matchinfo), (match (wip_match_opcode G_SMAX, G_SMIN, G_UMAX, G_UMIN, G_FMINNUM, G_FMAXNUM):$min_or_max, - [{ return matchMinMaxToMed3(*${min_or_max}, MRI, ${matchinfo}); }]), - (apply [{ applyMinMaxToMed3(*${min_or_max}, ${matchinfo}); }])>; + [{ return matchMinMaxToMed3OrClamp(*${min_or_max}, MRI, ${matchinfo}); }]), + (apply [{ applyMinMaxToMed3OrClamp(*${min_or_max}, ${matchinfo}); }])>; + +def fmed3_to_clamp_matchdata : GIDefMatchData<"FMed3ToClampMatchInfo">; +def fmed3_to_clamp : GICombineRule< + (defs root:$fmed3, fmed3_to_clamp_matchdata:$matchinfo), + (match (wip_match_opcode G_INTRINSIC):$fmed3, + [{ return matchFMed3ToClamp(*${fmed3}, MRI, ${matchinfo}); }]), + (apply [{ applyFMed3ToClamp(*${fmed3}, ${matchinfo}); }])>; // Combines which should only apply on SI/VI def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>; @@ -61,7 +68,7 @@ def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper< "AMDGPUGenPostLegalizerCombinerHelper", [all_combines, gfx6gfx7_combines, - uchar_to_float, cvt_f32_ubyteN, minmax_to_med3]> { + uchar_to_float, cvt_f32_ubyteN, minmax_to_med3_or_clamp, fmed3_to_clamp]> { let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule"; } Index: llvm/lib/Target/AMDGPU/AMDGPUGISel.td 
=================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -213,6 +213,7 @@ def : GINodeEquiv<G_AMDGPU_SMED3, AMDGPUsmed3>; def : GINodeEquiv<G_AMDGPU_UMED3, AMDGPUumed3>; def : GINodeEquiv<G_AMDGPU_FMED3, AMDGPUfmed3>; +def : GINodeEquiv<G_AMDGPU_CLAMP, AMDGPUclamp>; class GISelSop2Pat < SDPatternOperator node, Index: llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp @@ -124,13 +124,13 @@ } // end anonymous namespace -struct MinMaxToMed3MatchInfo { +struct MinMaxToMed3OrClampMatchInfo { unsigned Opc; Register Val0, Val1, Val2; }; -static bool matchMinMaxToMed3(MachineInstr &MI, MachineRegisterInfo &MRI, - MinMaxToMed3MatchInfo &MatchInfo) { +static bool matchMinMaxToMed3OrClamp(MachineInstr &MI, MachineRegisterInfo &MRI, + MinMaxToMed3OrClampMatchInfo &MatchInfo) { unsigned Opc = MI.getOpcode(); MinMaxMedOpc OpcodeTriple = getMinMaxPair(Opc); if (!OpcodeTriple.Min) @@ -177,6 +177,19 @@ assert(!Info->getMode().IEEE && "G_FMINNUM was supposed to be lowered to " "G_FMINNUM_IEEE with IEEE=true"); + // Clamp is safe to fold for non-NaN input (we primarily want to check for + // no-NaN flags). When one input is NaN and with IEEE=false both G_FMINNUM + // and G_FMAXNUM return non-NaN input thus input pattern evaluates to 0.0. + // This is safe to clamp only with DX10Clamp=true (when true clamps NaN to + // 0.0, otherwise lets NaN through). 
+ if (KO_FPImm.isExactlyValue(0.0) && K1_FPImm.isExactlyValue(1.0)) { + Register InnerInstDef = InnerInst->getOperand(0).getReg(); + if (Info->getMode().DX10Clamp || isKnownNeverNaN(InnerInstDef, MRI)) { + MatchInfo = {AMDGPU::G_AMDGPU_CLAMP, ValDef, 0, 0}; + return true; + } + } + + const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo(); + APInt KObits = KO_FPImm.bitcastToAPInt(); + APInt K1bits = K1_FPImm.bitcastToAPInt(); @@ -191,11 +204,77 @@ return false; } -static void applyMinMaxToMed3(MachineInstr &MI, - MinMaxToMed3MatchInfo &MatchInfo) { +static void applyMinMaxToMed3OrClamp(MachineInstr &MI, + MinMaxToMed3OrClampMatchInfo &MatchInfo) { + MachineIRBuilder B(MI); + if (MatchInfo.Opc == AMDGPU::G_AMDGPU_CLAMP) + B.buildInstr(AMDGPU::G_AMDGPU_CLAMP, {MI.getOperand(0)}, {MatchInfo.Val0}); + else + B.buildInstr(MatchInfo.Opc, {MI.getOperand(0)}, + {MatchInfo.Val0, MatchInfo.Val1, MatchInfo.Val2}); + MI.eraseFromParent(); +} + +struct FMed3ToClampMatchInfo { + Register Val; +}; + +bool isOperandExactlyValue(const MachineInstr &MI, unsigned Idx, + MachineRegisterInfo &MRI, double Val) { + const MachineInstr *Inst = MRI.getVRegDef(MI.getOperand(Idx).getReg()); + if (Inst->getOpcode() == AMDGPU::G_FCONSTANT && + Inst->getOperand(1).getFPImm()->isExactlyValue(Val)) + return true; + return false; +} + +static bool matchFMed3ToClamp(MachineInstr &MI, MachineRegisterInfo &MRI, + FMed3ToClampMatchInfo &MatchInfo) { + if (MI.getOpcode() == AMDGPU::G_INTRINSIC && + MI.getIntrinsicID() == Intrinsic::amdgcn_fmed3) { + // In llvm IR, clamp is often represented as intrinsic call to + // @llvm.amdgcn.fmed3.f32(%Val, 0.0, 1.0). Check for other operand orders. + unsigned ValIdx = 2, ZeroIdx = 3, OneIdx = 4; + // Find index of the operand with 0.0. 
+ if (!isOperandExactlyValue(MI, ZeroIdx, MRI, 0.0)) { + std::swap(ZeroIdx, ValIdx); + if (!isOperandExactlyValue(MI, ZeroIdx, MRI, 0.0)) { + std::swap(ZeroIdx, OneIdx); + if (!isOperandExactlyValue(MI, ZeroIdx, MRI, 0.0)) + return false; + } + } + // Find index of the operand with 1.0, remaining index is Val. + if (!isOperandExactlyValue(MI, OneIdx, MRI, 1.0)) { + std::swap(OneIdx, ValIdx); + if (!isOperandExactlyValue(MI, OneIdx, MRI, 1.0)) + return false; + } + + const MachineFunction *MF = MI.getMF(); + const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); + // It is safe to clamp for non-NaN input. For NaN input, fmed3(a, b, c) is + // equivalent to min(min(a, b), c). With IEEE=false, when one input is NaN + // min returns non-NaN input. Thus fmed3(NaN, 0.0, 1.0) evaluates to 0.0 for + // all operand permutations. This is safe to clamp only with DX10Clamp=true + // (when true clamps NaN to 0.0, otherwise lets NaN through). + if (!Info->getMode().IEEE) { + Register Def = MI.getOperand(0).getReg(); + if (Info->getMode().DX10Clamp || isKnownNeverNaN(Def, MRI)) { + MatchInfo.Val = MI.getOperand(ValIdx).getReg(); + return true; + } + } + // TODO: IEEE=true. 
+ } + return false; +} + +static void applyFMed3ToClamp(MachineInstr &MI, + FMed3ToClampMatchInfo &MatchInfo) { MachineIRBuilder B(MI); - B.buildInstr(MatchInfo.Opc, {MI.getOperand(0)}, - {MatchInfo.Val0, MatchInfo.Val1, MatchInfo.Val2}); + + B.buildInstr(AMDGPU::G_AMDGPU_CLAMP, {MI.getOperand(0)}, {MatchInfo.Val}); MI.eraseFromParent(); } Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3643,6 +3643,7 @@ case AMDGPU::G_AMDGPU_SMED3: case AMDGPU::G_AMDGPU_UMED3: case AMDGPU::G_AMDGPU_FMED3: + case AMDGPU::G_AMDGPU_CLAMP: return getDefaultMappingVOP(MI); case AMDGPU::G_UMULH: case AMDGPU::G_SMULH: { Index: llvm/lib/Target/AMDGPU/SIInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/SIInstructions.td +++ llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2699,3 +2699,9 @@ let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2); let hasSideEffects = 0; } + +def G_AMDGPU_CLAMP : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src); + let hasSideEffects = 0; +} Index: llvm/test/CodeGen/AMDGPU/GlobalISel/clamp.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/clamp.ll @@ -0,0 +1,94 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s + +define amdgpu_ps float @test_maybe_nan_input_dx10clamp_true(float %a, float %b) { +; GFX10-LABEL: test_maybe_nan_input_dx10clamp_true: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mul_f32_e64 v0, v0, v1 clamp +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: ; return to shader part epilog + %mul = fmul float %a, %b + 
%maxnum = call float @llvm.maxnum.f32(float %mul, float 0.0) + %fmed = call float @llvm.minnum.f32(float %maxnum, float 1.0) + ret float %fmed +} + +define amdgpu_ps float @test_maybe_nan_input_dx10clamp_false(float %a, float %b) #0 { +; GFX10-LABEL: test_maybe_nan_input_dx10clamp_false: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_med3_f32 v0, v0, 0, 1.0 +; GFX10-NEXT: ; return to shader part epilog + %mul = fmul float %a, %b + %maxnum = call float @llvm.maxnum.f32(float %mul, float 0.0) + %fmed = call float @llvm.minnum.f32(float %maxnum, float 1.0) + ret float %fmed +} + +; FIXME: in isKnownNeverNaN +define amdgpu_ps float @test_nnan_input_dx10clamp_false(float %a, float %b) #0 { +; GFX10-LABEL: test_nnan_input_dx10clamp_false: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_med3_f32 v0, v0, 0, 1.0 +; GFX10-NEXT: ; return to shader part epilog + %mul = fmul nnan float %a, %b + %maxnum = call float @llvm.maxnum.f32(float %mul, float 0.0) + %fmed = call float @llvm.minnum.f32(float %maxnum, float 1.0) + ret float %fmed +} + +define amdgpu_ps float @test_nnan_innernode_dx10clamp_false(float %a, float %b) #0 { +; GFX10-LABEL: test_nnan_innernode_dx10clamp_false: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mul_f32_e64 v0, v0, v1 clamp +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: ; return to shader part epilog + %mul = fmul float %a, %b + %maxnum = call nnan float @llvm.maxnum.f32(float %mul, float 0.0) + %fmed = call float @llvm.minnum.f32(float %maxnum, float 1.0) + ret float %fmed +} + +define amdgpu_ps float @test_fmed3_dx10clamp_true(float %a, float %b) { +; GFX10-LABEL: test_fmed3_dx10clamp_true: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mul_f32_e64 v0, v0, v1 clamp +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: ; return to shader part epilog + %mul = fmul float %a, %b + %fmed = call float @llvm.amdgcn.fmed3.f32(float %mul, 
float 0.0, float 1.0) + ret float %fmed +} + +define amdgpu_ps float @test_fmed3_dx10clamp_false(float %a, float %b) #0 { +; GFX10-LABEL: test_fmed3_dx10clamp_false: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_med3_f32 v0, v0, 0, 1.0 +; GFX10-NEXT: ; return to shader part epilog + %mul = fmul float %a, %b + %fmed = call float @llvm.amdgcn.fmed3.f32(float %mul, float 0.0, float 1.0) + ret float %fmed +} + +define amdgpu_ps float @test_nnan_fmed3_dx10clamp_false(float %a, float %b) #0 { +; GFX10-LABEL: test_nnan_fmed3_dx10clamp_false: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mul_f32_e64 v0, v0, v1 clamp +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: ; return to shader part epilog + %mul = fmul float %a, %b + %fmed = call nnan float @llvm.amdgcn.fmed3.f32(float %mul, float 0.0, float 1.0) + ret float %fmed +} + +declare float @llvm.minnum.f32(float, float) +declare float @llvm.maxnum.f32(float, float) +declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1 + +attributes #0 = {"amdgpu-dx10-clamp"="false"} +attributes #1 = { nounwind readnone speculatable willreturn } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-clamp.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-clamp.mir @@ -0,0 +1,223 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s + +--- +name: test_maybe_nan_input_dx10clamp_true +legalized: true +tracksRegLiveness: true +machineFunctionInfo: + mode: + ieee: false + dx10-clamp: true +body: | + bb.1: + liveins: $vgpr0, $vgpr1 + + ; GCN-LABEL: name: test_maybe_nan_input_dx10clamp_true + ; GCN: liveins: $vgpr0, $vgpr1 + ; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: 
[[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[COPY1]] + ; GCN: [[AMDGPU_CLAMP:%[0-9]+]]:_(s32) = G_AMDGPU_CLAMP [[FMUL]] + ; GCN: $vgpr0 = COPY [[AMDGPU_CLAMP]](s32) + ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %4:_(s32) = G_FMUL %0, %1 + %5:_(s32) = G_FCONSTANT float 0.000000e+00 + %6:_(s32) = G_FMAXNUM %4, %5 + %7:_(s32) = G_FCONSTANT float 1.000000e+00 + %8:_(s32) = G_FMINNUM %6, %7 + $vgpr0 = COPY %8(s32) + SI_RETURN_TO_EPILOG implicit $vgpr0 +... + +--- +name: test_maybe_nan_input_dx10clamp_false +legalized: true +tracksRegLiveness: true +machineFunctionInfo: + mode: + ieee: false + dx10-clamp: false +body: | + bb.1: + liveins: $vgpr0, $vgpr1 + + ; GCN-LABEL: name: test_maybe_nan_input_dx10clamp_false + ; GCN: liveins: $vgpr0, $vgpr1 + ; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[COPY1]] + ; GCN: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GCN: [[AMDGPU_FMED3_:%[0-9]+]]:_(s32) = G_AMDGPU_FMED3 [[FMUL]], [[C]], [[C1]] + ; GCN: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32) + ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %4:_(s32) = G_FMUL %0, %1 + %5:_(s32) = G_FCONSTANT float 0.000000e+00 + %6:_(s32) = G_FMAXNUM %4, %5 + %7:_(s32) = G_FCONSTANT float 1.000000e+00 + %8:_(s32) = G_FMINNUM %6, %7 + $vgpr0 = COPY %8(s32) + SI_RETURN_TO_EPILOG implicit $vgpr0 +... 
+ +# FIXME: in isKnownNeverNaN +--- +name: test_nnan_input_dx10clamp_false +legalized: true +tracksRegLiveness: true +machineFunctionInfo: + mode: + ieee: false + dx10-clamp: false +body: | + bb.1: + liveins: $vgpr0, $vgpr1 + + ; GCN-LABEL: name: test_nnan_input_dx10clamp_false + ; GCN: liveins: $vgpr0, $vgpr1 + ; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: [[FMUL:%[0-9]+]]:_(s32) = nnan G_FMUL [[COPY]], [[COPY1]] + ; GCN: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GCN: [[AMDGPU_FMED3_:%[0-9]+]]:_(s32) = G_AMDGPU_FMED3 [[FMUL]], [[C]], [[C1]] + ; GCN: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32) + ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %4:_(s32) = nnan G_FMUL %0, %1 + %5:_(s32) = G_FCONSTANT float 0.000000e+00 + %6:_(s32) = G_FMAXNUM %4, %5 + %7:_(s32) = G_FCONSTANT float 1.000000e+00 + %8:_(s32) = G_FMINNUM %6, %7 + $vgpr0 = COPY %8(s32) + SI_RETURN_TO_EPILOG implicit $vgpr0 +... + +--- +name: test_nnan_innernode_dx10clamp_false +legalized: true +tracksRegLiveness: true +machineFunctionInfo: + mode: + ieee: false + dx10-clamp: false +body: | + bb.1: + liveins: $vgpr0, $vgpr1 + + ; GCN-LABEL: name: test_nnan_innernode_dx10clamp_false + ; GCN: liveins: $vgpr0, $vgpr1 + ; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[COPY1]] + ; GCN: [[AMDGPU_CLAMP:%[0-9]+]]:_(s32) = G_AMDGPU_CLAMP [[FMUL]] + ; GCN: $vgpr0 = COPY [[AMDGPU_CLAMP]](s32) + ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %4:_(s32) = G_FMUL %0, %1 + %5:_(s32) = G_FCONSTANT float 0.000000e+00 + %6:_(s32) = nnan G_FMAXNUM %4, %5 + %7:_(s32) = G_FCONSTANT float 1.000000e+00 + %8:_(s32) = G_FMINNUM %6, %7 + $vgpr0 = COPY %8(s32) + SI_RETURN_TO_EPILOG implicit $vgpr0 +... 
+ +--- +name: test_fmed3_dx10clamp_true +legalized: true +tracksRegLiveness: true +machineFunctionInfo: + mode: + ieee: false + dx10-clamp: true +body: | + bb.1: + liveins: $vgpr0, $vgpr1 + + ; GCN-LABEL: name: test_fmed3_dx10clamp_true + ; GCN: liveins: $vgpr0, $vgpr1 + ; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[COPY1]] + ; GCN: [[AMDGPU_CLAMP:%[0-9]+]]:_(s32) = G_AMDGPU_CLAMP [[FMUL]] + ; GCN: $vgpr0 = COPY [[AMDGPU_CLAMP]](s32) + ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %4:_(s32) = G_FMUL %0, %1 + %7:_(s32) = G_FCONSTANT float 1.000000e+00 + %6:_(s32) = G_FCONSTANT float 0.000000e+00 + %5:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmed3), %4(s32), %6(s32), %7(s32) + $vgpr0 = COPY %5(s32) + SI_RETURN_TO_EPILOG implicit $vgpr0 +... + +--- +name: test_fmed3_dx10clamp_false +legalized: true +tracksRegLiveness: true +machineFunctionInfo: + mode: + ieee: false + dx10-clamp: false +body: | + bb.1: + liveins: $vgpr0, $vgpr1 + + ; GCN-LABEL: name: test_fmed3_dx10clamp_false + ; GCN: liveins: $vgpr0, $vgpr1 + ; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[COPY1]] + ; GCN: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GCN: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmed3), [[FMUL]](s32), [[C1]](s32), [[C]](s32) + ; GCN: $vgpr0 = COPY [[INT]](s32) + ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %4:_(s32) = G_FMUL %0, %1 + %7:_(s32) = G_FCONSTANT float 1.000000e+00 + %6:_(s32) = G_FCONSTANT float 0.000000e+00 + %5:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmed3), %4(s32), %6(s32), %7(s32) + $vgpr0 = COPY %5(s32) + SI_RETURN_TO_EPILOG implicit $vgpr0 +... 
+ +--- +name: test_nnan_fmed3_dx10clamp_false +legalized: true +tracksRegLiveness: true +machineFunctionInfo: + mode: + ieee: false + dx10-clamp: false +body: | + bb.1: + liveins: $vgpr0, $vgpr1 + + ; GCN-LABEL: name: test_nnan_fmed3_dx10clamp_false + ; GCN: liveins: $vgpr0, $vgpr1 + ; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[COPY1]] + ; GCN: [[AMDGPU_CLAMP:%[0-9]+]]:_(s32) = G_AMDGPU_CLAMP [[FMUL]] + ; GCN: $vgpr0 = COPY [[AMDGPU_CLAMP]](s32) + ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %4:_(s32) = G_FMUL %0, %1 + %7:_(s32) = G_FCONSTANT float 1.000000e+00 + %6:_(s32) = G_FCONSTANT float 0.000000e+00 + %5:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.fmed3), %4(s32), %6(s32), %7(s32) + $vgpr0 = COPY %5(s32) + SI_RETURN_TO_EPILOG implicit $vgpr0 +...