Index: include/llvm/CodeGen/SelectionDAG.h
===================================================================
--- include/llvm/CodeGen/SelectionDAG.h
+++ include/llvm/CodeGen/SelectionDAG.h
@@ -1484,8 +1484,15 @@
   /// X|Cst == X+Cst iff X&Cst = 0.
   bool isBaseWithConstantOffset(SDValue Op) const;
 
-  /// Test whether the given SDValue is known to never be NaN.
-  bool isKnownNeverNaN(SDValue Op) const;
+  /// Test whether the given SDValue is known to never be NaN. If \p SNaN is
+  /// true, returns true if \p Op is known to never be a signaling NaN (it may
+  /// still be a qNaN).
+  bool isKnownNeverNaN(SDValue Op, bool SNaN = false, unsigned Depth = 0) const;
+
+  /// \returns true if \p Op is known to never be a signaling NaN.
+  bool isKnownNeverSNaN(SDValue Op, unsigned Depth = 0) const {
+    return isKnownNeverNaN(Op, true, Depth);
+  }
 
   /// Test whether the given floating point SDValue is known to never be
   /// positive or negative zero.
Index: include/llvm/CodeGen/TargetLowering.h
===================================================================
--- include/llvm/CodeGen/TargetLowering.h
+++ include/llvm/CodeGen/TargetLowering.h
@@ -2867,6 +2867,13 @@
       SDValue Op, const APInt &DemandedElts, APInt &KnownUndef,
       APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth = 0) const;
 
+  /// If \p SNaN is false, \returns true if \p Op is known to never be any
+  /// NaN. If \p SNaN is true, returns true if \p Op is known to never be a
+  /// signaling NaN.
+  virtual bool isKnownNeverNaNForTargetNode(SDValue Op,
+                                            const SelectionDAG &DAG,
+                                            bool SNaN = false,
+                                            unsigned Depth = 0) const;
   struct DAGCombinerInfo {
     void *DC; // The DAG Combiner object.
     CombineLevel Level;
Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3594,21 +3594,102 @@
   return true;
 }
 
-bool SelectionDAG::isKnownNeverNaN(SDValue Op) const {
+bool SelectionDAG::isKnownNeverNaN(SDValue Op, bool SNaN, unsigned Depth) const {
   // If we're told that NaNs won't happen, assume they won't.
-  if (getTarget().Options.NoNaNsFPMath)
+  if (getTarget().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs())
     return true;
 
-  if (Op->getFlags().hasNoNaNs())
-    return true;
+  if (Depth >= 6)
+    return false; // Limit search depth, also for callers passing Depth > 6.
 
+  // TODO: Handle vectors.
   // If the value is a constant, we can obviously see if it is a NaN or not.
-  if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
-    return !C->getValueAPF().isNaN();
+  if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
+    return !C->getValueAPF().isNaN() ||
+           (SNaN && !C->getValueAPF().isSignaling());
+  }
 
-  // TODO: Recognize more cases here.
+  unsigned Opcode = Op.getOpcode();
+  switch (Opcode) {
+  case ISD::FADD:
+  case ISD::FSUB:
+  case ISD::FMUL: {
+    if (SNaN)
+      return true;
+    return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
+           isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
+  }
+  case ISD::FCANONICALIZE:
+  case ISD::FEXP:
+  case ISD::FEXP2:
+  case ISD::FTRUNC:
+  case ISD::FFLOOR:
+  case ISD::FCEIL:
+  case ISD::FROUND:
+  case ISD::FRINT:
+  case ISD::FNEARBYINT: {
+    if (SNaN)
+      return true;
+    return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
+  }
+  case ISD::FABS:
+  case ISD::FNEG:
+  case ISD::FCOPYSIGN: {
+    return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
+  }
+  case ISD::SELECT:
+    return isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
+           isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
+  case ISD::FDIV:
+  case ISD::FREM:
+  case ISD::FSIN:
+  case ISD::FCOS: {
+    if (SNaN)
+      return true;
+    // TODO: Need isKnownNeverInfinity
+    return false;
+  }
+  case ISD::FP_EXTEND:
+  case ISD::FP_ROUND: {
+    if (SNaN)
+      return true;
+    return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
+  }
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP:
+    return true;
+  case ISD::FMA:
+  case ISD::FMAD: {
+    if (SNaN)
+      return true;
+    return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
+           isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
+           isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
+  }
+  case ISD::FSQRT: // Need is known positive
+  case ISD::FLOG:
+  case ISD::FLOG2:
+  case ISD::FLOG10:
+  case ISD::FPOWI:
+  case ISD::FPOW: {
+    if (SNaN)
+      return true;
+    // TODO: Refine on operand
+    return false;
+  }
 
-  return false;
+  // TODO: Handle FMINNUM/FMAXNUM/FMINNAN/FMAXNAN when there is an agreement on
+  // what they should do.
+  default:
+    if (Opcode >= ISD::BUILTIN_OP_END ||
+        Opcode == ISD::INTRINSIC_WO_CHAIN ||
+        Opcode == ISD::INTRINSIC_W_CHAIN ||
+        Opcode == ISD::INTRINSIC_VOID) {
+      return TLI->isKnownNeverNaNForTargetNode(Op, *this, SNaN, Depth);
+    }
+
+    return false;
+  }
 }
 
 bool SelectionDAG::isKnownNeverZeroFloat(SDValue Op) const {
Index: lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1711,6 +1711,19 @@
   return false;
 }
 
+bool TargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
+                                                  const SelectionDAG &DAG,
+                                                  bool SNaN,
+                                                  unsigned Depth) const {
+  assert((Op.getOpcode() >= ISD::BUILTIN_OP_END ||
+          Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
+          Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
+          Op.getOpcode() == ISD::INTRINSIC_VOID) &&
+         "Should use isKnownNeverNaN if you don't know whether Op"
+         " is a target node!");
+  return false;
+}
+
 // FIXME: Ideally, this would use ISD::isConstantSplatVector(), but that must
 // work with truncating build vectors and vectors with elements of less than
 // 8 bits.
Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -246,6 +246,11 @@
                                      const SelectionDAG &DAG,
                                      unsigned Depth = 0) const override;
 
+  bool isKnownNeverNaNForTargetNode(SDValue Op,
+                                    const SelectionDAG &DAG,
+                                    bool SNaN = false,
+                                    unsigned Depth = 0) const override;
+
   /// Helper function that adds Reg to the LiveIn list of the DAG's
   /// MachineFunction.
   ///
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4323,3 +4323,86 @@
     return 1;
   }
 }
+
+bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
+                                                        const SelectionDAG &DAG,
+                                                        bool SNaN,
+                                                        unsigned Depth) const {
+  unsigned Opcode = Op.getOpcode();
+  switch (Opcode) {
+  case AMDGPUISD::FMIN_LEGACY:
+  case AMDGPUISD::FMAX_LEGACY: {
+    if (SNaN)
+      return true;
+
+    // TODO: Can check no nans on one of the operands for each one, but which
+    // one?
+    return false;
+  }
+  case AMDGPUISD::FMUL_LEGACY: {
+    if (SNaN)
+      return true;
+    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
+           DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
+  }
+  case AMDGPUISD::FMED3:
+  case AMDGPUISD::FMIN3:
+  case AMDGPUISD::FMAX3:
+  case AMDGPUISD::FMAD_FTZ: {
+    if (SNaN)
+      return true;
+    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
+           DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
+           DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
+  }
+  case AMDGPUISD::CVT_F32_UBYTE0:
+  case AMDGPUISD::CVT_F32_UBYTE1:
+  case AMDGPUISD::CVT_F32_UBYTE2:
+  case AMDGPUISD::CVT_F32_UBYTE3:
+    return true;
+
+  case AMDGPUISD::RCP:
+  case AMDGPUISD::RSQ:
+  case AMDGPUISD::RCP_LEGACY:
+  case AMDGPUISD::RSQ_LEGACY:
+  case AMDGPUISD::RSQ_CLAMP: {
+    if (SNaN)
+      return true;
+
+    // TODO: Need is known positive check.
+    return false;
+  }
+  case AMDGPUISD::LDEXP: {
+    if (SNaN)
+      return true;
+    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
+  }
+  case AMDGPUISD::DIV_SCALE:
+  case AMDGPUISD::DIV_FMAS:
+  case AMDGPUISD::DIV_FIXUP:
+  case AMDGPUISD::TRIG_PREOP:
+    // TODO: Refine on operands.
+    return SNaN;
+  case AMDGPUISD::SIN_HW:
+  case AMDGPUISD::COS_HW: {
+    // TODO: Need check for infinity
+    return SNaN;
+  }
+  case ISD::INTRINSIC_WO_CHAIN: {
+    unsigned IntrinsicID
+      = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+    // TODO: Handle more intrinsics
+    switch (IntrinsicID) {
+    case Intrinsic::amdgcn_cubeid:
+      return true;
+
+    case Intrinsic::amdgcn_frexp_mant:
+      return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
+    default:
+      return false;
+    }
+  }
+  default:
+    return false;
+  }
+}
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6717,13 +6717,6 @@
   return AMDGPUTargetLowering::performRcpCombine(N, DCI);
 }
 
-static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
-  if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions())
-    return true;
-
-  return DAG.isKnownNeverNaN(Op);
-}
-
 static bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
                             const GCNSubtarget *ST, unsigned MaxDepth=5) {
   // If source is a result of another standard FP operation it is already in
@@ -6814,7 +6807,7 @@
 
   bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
 
-  if ((IsIEEEMode || isKnownNeverSNan(DAG, N0)) &&
+  if ((IsIEEEMode || DAG.isKnownNeverSNaN(N0)) &&
       isCanonicalized(DAG, N0, ST))
     return N0;
 
@@ -6959,7 +6952,7 @@
     // then give the other result, which is different from med3 with a NaN
     // input.
SDValue Var = Op0.getOperand(0); - if (!isKnownNeverSNan(DAG, Var)) + if (!DAG.isKnownNeverSNaN(Var)) return SDValue(); return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Index: test/CodeGen/AMDGPU/clamp.ll =================================================================== --- test/CodeGen/AMDGPU/clamp.ll +++ test/CodeGen/AMDGPU/clamp.ll @@ -53,9 +53,30 @@ ; GCN-LABEL: {{^}}v_clamp_negzero_f32: ; GCN-DAG: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], 0.5, [[A]] ; GCN-DAG: v_bfrev_b32_e32 [[SIGNBIT:v[0-9]+]], 1 -; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[SIGNBIT]], 1.0 +; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], [[SIGNBIT]], 1.0 define amdgpu_kernel void @v_clamp_negzero_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %add = fadd nnan float %a, 0.5 + %max = call float @llvm.maxnum.f32(float %add, float -0.0) + %med = call float @llvm.minnum.f32(float %max, float 1.0) + + store float %med, float addrspace(1)* %out.gep + ret void +} + +; FIXME: Weird inconsistency in how -0.0 is treated. Accepted if clamp +; matched through med3, not if directly. Is this correct? 
+ +; GCN-LABEL: {{^}}v_clamp_negzero_maybe_snan_f32: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0x80000000, [[A]] +; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]] +define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -352,13 +373,15 @@ ; GCN-LABEL: {{^}}v_clamp_f32_no_dx10_clamp: ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0 +; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 0.5, [[A]] +; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 0, 1.0 define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid %a = load float, float addrspace(1)* %gep0 - %max = call float @llvm.maxnum.f32(float %a, float 0.0) + %a.nnan = fadd nnan float %a, 0.5 + %max = call float @llvm.maxnum.f32(float %a.nnan, float 0.0) %med = call float @llvm.minnum.f32(float %max, float 1.0) store float %med, float addrspace(1)* %out.gep @@ -367,13 +390,14 @@ ; GCN-LABEL: {{^}}v_clamp_f32_snan_dx10clamp: ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} +; GCN: v_add_f32_e64 [[ADD:v[0-9]+]], [[A]], 0.5 clamp{{$}} define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #3 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid %a = load float, float addrspace(1)* %gep0 - %max = call float @llvm.maxnum.f32(float %a, float 
0.0) + %add = fadd float %a, 0.5 + %max = call float @llvm.maxnum.f32(float %add, float 0.0) %med = call float @llvm.minnum.f32(float %max, float 1.0) store float %med, float addrspace(1)* %out.gep Index: test/CodeGen/AMDGPU/fmed3.ll =================================================================== --- test/CodeGen/AMDGPU/fmed3.ll +++ test/CodeGen/AMDGPU/fmed3.ll @@ -1,9 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=NOSNAN -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mattr=+fp-exceptions -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SNAN -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=NOSNAN -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp-exceptions -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SNAN -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=NOSNAN -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+fp-exceptions -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SNAN -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s ; GCN-LABEL: {{^}}v_test_nnan_input_fmed3_r_i_i_f32: @@ -22,87 +19,82 @@ ret void } -; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_f32: -; NOSNAN: 
v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0 - -; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} -; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} -define amdgpu_kernel void @v_test_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { +; GCN-LABEL: {{^}}v_test_fmed3_nnan_r_i_i_f32: +; GCN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0 +define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid %a = load float, float addrspace(1)* %gep0 + %a.add = fadd nnan float %a, 1.0 - %max = call float @llvm.maxnum.f32(float %a, float 2.0) + %max = call float @llvm.maxnum.f32(float %a.add, float 2.0) %med = call float @llvm.minnum.f32(float %max, float 4.0) store float %med, float addrspace(1)* %outgep ret void } -; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_commute0_f32: -; NOSNAN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0 - -; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} -; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} -define amdgpu_kernel void @v_test_fmed3_r_i_i_commute0_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { +; GCN-LABEL: {{^}}v_test_fmed3_nnan_r_i_i_commute0_f32: +; GCN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0 +define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid %a = load float, float addrspace(1)* %gep0 + %a.add = fadd nnan float %a, 1.0 - %max = call float @llvm.maxnum.f32(float 2.0, float %a) + %max = call float @llvm.maxnum.f32(float 2.0, float %a.add) %med = call float @llvm.minnum.f32(float 4.0, float %max) store float %med, float 
addrspace(1)* %outgep ret void } -; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_commute1_f32: -; NOSNAN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0 - -; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} -; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} -define amdgpu_kernel void @v_test_fmed3_r_i_i_commute1_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { +; GCN-LABEL: {{^}}v_test_fmed3_nnan_r_i_i_commute1_f32: +; GCN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0 +define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid %a = load float, float addrspace(1)* %gep0 + %a.add = fadd nnan float %a, 1.0 - %max = call float @llvm.maxnum.f32(float %a, float 2.0) + %max = call float @llvm.maxnum.f32(float %a.add, float 2.0) %med = call float @llvm.minnum.f32(float 4.0, float %max) store float %med, float addrspace(1)* %outgep ret void } -; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_constant_order_f32: +; GCN-LABEL: {{^}}v_test_fmed3_nnan_r_i_i_constant_order_f32: ; GCN: v_max_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} ; GCN: v_min_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} -define amdgpu_kernel void @v_test_fmed3_r_i_i_constant_order_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid %a = load float, float addrspace(1)* %gep0 + %a.add = fadd nnan float %a, 1.0 - %max = call float @llvm.maxnum.f32(float %a, float 4.0) + %max = call float @llvm.maxnum.f32(float %a.add, float 4.0) %med = call float @llvm.minnum.f32(float %max, float 2.0) 
store float %med, float addrspace(1)* %outgep ret void } - -; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_multi_use_f32: +; GCN-LABEL: {{^}}v_test_fmed3_nnan_r_i_i_multi_use_f32: ; GCN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} ; GCN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} -define amdgpu_kernel void @v_test_fmed3_r_i_i_multi_use_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid %a = load float, float addrspace(1)* %gep0 + %a.add = fadd nnan float %a, 1.0 - %max = call float @llvm.maxnum.f32(float %a, float 2.0) + %max = call float @llvm.maxnum.f32(float %a.add, float 2.0) %med = call float @llvm.minnum.f32(float %max, float 4.0) store volatile float %med, float addrspace(1)* %outgep @@ -118,8 +110,9 @@ %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid %outgep = getelementptr double, double addrspace(1)* %out, i32 %tid %a = load double, double addrspace(1)* %gep0 + %a.add = fadd nnan double %a, 1.0 - %max = call double @llvm.maxnum.f64(double %a, double 2.0) + %max = call double @llvm.maxnum.f64(double %a.add, double 2.0) %med = call double @llvm.minnum.f64(double %max, double 4.0) store double %med, double addrspace(1)* %outgep @@ -142,19 +135,17 @@ } ; GCN-LABEL: {{^}}v_test_legacy_fmed3_r_i_i_f32: -; NOSNAN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0 - -; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} -; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} +; GCN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0 define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %outgep = 
getelementptr float, float addrspace(1)* %out, i32 %tid %a = load float, float addrspace(1)* %gep0 + %a.nnan = fadd nnan float %a, 1.0 ; fmax_legacy - %cmp0 = fcmp ule float %a, 2.0 - %max = select i1 %cmp0, float 2.0, float %a + %cmp0 = fcmp ule float %a.nnan, 2.0 + %max = select i1 %cmp0, float 2.0, float %a.nnan ; fmin_legacy %cmp1 = fcmp uge float %max, 4.0 Index: test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll =================================================================== --- test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll +++ test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll @@ -24,16 +24,16 @@ ; VI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0 define amdgpu_kernel void @multiple_fadd_use_test_f32(float addrspace(1)* %out, float %x, float %y, float %z) #0 { - %a11 = fadd fast float %y, -1.0 + %a11 = fadd float %y, -1.0 %a12 = call float @llvm.fabs.f32(float %a11) - %a13 = fadd fast float %x, -1.0 + %a13 = fadd float %x, -1.0 %a14 = call float @llvm.fabs.f32(float %a13) %a15 = fcmp ogt float %a12, %a14 %a16 = select i1 %a15, float %a12, float %a14 - %a17 = fmul fast float %a16, 2.0 - %a18 = fmul fast float %a17, %a17 - %a19 = fmul fast float %a18, %a17 - %a20 = fsub fast float 1.0, %a19 + %a17 = fmul float %a16, 2.0 + %a18 = fmul float %a17, %a17 + %a19 = fmul float %a18, %a17 + %a20 = fsub float 1.0, %a19 store float %a20, float addrspace(1)* %out ret void } @@ -123,16 +123,16 @@ %x = bitcast i16 %x.arg to half %y = bitcast i16 %y.arg to half %z = bitcast i16 %z.arg to half - %a11 = fadd fast half %y, -1.0 + %a11 = fadd half %y, -1.0 %a12 = call half @llvm.fabs.f16(half %a11) - %a13 = fadd fast half %x, -1.0 + %a13 = fadd half %x, -1.0 %a14 = call half @llvm.fabs.f16(half %a13) %a15 = fcmp ogt half %a12, %a14 %a16 = select i1 %a15, half %a12, half %a14 - %a17 = fmul fast half %a16, 2.0 - %a18 = fmul fast half %a17, %a17 - %a19 = fmul fast half %a18, %a17 - %a20 = fsub fast half 1.0, %a19 + 
%a17 = fmul half %a16, 2.0 + %a18 = fmul half %a17, %a17 + %a19 = fmul half %a18, %a17 + %a20 = fsub half 1.0, %a19 store half %a20, half addrspace(1)* %out ret void } Index: test/CodeGen/AMDGPU/known-never-snan.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/known-never-snan.ll @@ -0,0 +1,598 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s + +; Mostly overlaps with fmed3.ll to stress specific cases of +; isKnownNeverSNaN. + +define float @v_test_known_not_snan_fabs_input_fmed3_r_i_i_f32(float %a) #0 { +; GCN-LABEL: v_test_known_not_snan_fabs_input_fmed3_r_i_i_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-NEXT: v_med3_f32 v0, |v0|, 2.0, 4.0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %a.nnan.add = fdiv nnan float 1.0, %a + %known.not.snan = call float @llvm.fabs.f32(float %a.nnan.add) + %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) + %med = call float @llvm.minnum.f32(float %max, float 4.0) + ret float %med +} + +define float @v_test_known_not_snan_fneg_input_fmed3_r_i_i_f32(float %a) #0 { +; GCN-LABEL: v_test_known_not_snan_fneg_input_fmed3_r_i_i_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_rcp_f32_e64 v0, -v0 +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %a.nnan.add = fdiv nnan float 1.0, %a + %known.not.snan = fsub float -0.0, %a.nnan.add + %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) + %med = call float @llvm.minnum.f32(float %max, float 4.0) + ret float %med +} + +define float @v_test_known_not_snan_fpext_input_fmed3_r_i_i_f32(half %a) #0 { +; GCN-LABEL: v_test_known_not_snan_fpext_input_fmed3_r_i_i_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_add_f16_e32 v0, 1.0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %a.nnan.add = fadd nnan half %a, 1.0 + %known.not.snan = fpext half %a.nnan.add to float + %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) + %med = call float @llvm.minnum.f32(float %max, float 4.0) + ret float %med +} + +define float @v_test_known_not_snan_fptrunc_input_fmed3_r_i_i_f32(double %a) #0 { +; GCN-LABEL: v_test_known_not_snan_fptrunc_input_fmed3_r_i_i_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GCN-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %a.nnan.add = fadd nnan double %a, 1.0 + %known.not.snan = fptrunc double %a.nnan.add to float + %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) + %med = call float @llvm.minnum.f32(float %max, float 4.0) + ret float %med +} + +define float @v_test_known_not_snan_copysign_input_fmed3_r_i_i_f32(float %a, float %sign) #0 { +; GCN-LABEL: v_test_known_not_snan_copysign_input_fmed3_r_i_i_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-NEXT: s_brev_b32 s6, -2 +; GCN-NEXT: v_bfi_b32 v0, s6, v0, v1 +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %a.nnan.add = fdiv nnan float 1.0, %a + %known.not.snan = call float @llvm.copysign.f32(float %a.nnan.add, float %sign) + %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) + %med = call float @llvm.minnum.f32(float %max, float 4.0) + ret float %med +} + +; Canonicalize always quiets, so nothing is necessary. 
+define float @v_test_known_canonicalize_input_fmed3_r_i_i_f32(float %a) #0 { +; GCN-LABEL: v_test_known_canonicalize_input_fmed3_r_i_i_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %known.not.snan = call float @llvm.canonicalize.f32(float %a) + %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) + %med = call float @llvm.minnum.f32(float %max, float 4.0) + ret float %med +} + +define float @v_test_known_not_snan_minnum_input_fmed3_r_i_i_f32(float %a, float %b) #0 { +; GCN-LABEL: v_test_known_not_snan_minnum_input_fmed3_r_i_i_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: v_max_f32_e32 v0, 2.0, v0 +; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %a.nnan.add = fdiv nnan float 1.0, %a + %b.nnan.add = fadd nnan float %b, 1.0 + %known.not.snan = call float @llvm.minnum.f32(float %a.nnan.add, float %b.nnan.add) + %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) + %med = call float @llvm.minnum.f32(float %max, float 4.0) + ret float %med +} + +define float @v_minnum_possible_nan_lhs_input_fmed3_r_i_i_f32(float %a, float %b) #0 { +; GCN-LABEL: v_minnum_possible_nan_lhs_input_fmed3_r_i_i_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: v_max_f32_e32 v0, 2.0, v0 +; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %b.nnan.add = fadd nnan float %b, 1.0 + %known.not.snan = call float @llvm.minnum.f32(float %a, float %b.nnan.add) + %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) + %med = call float @llvm.minnum.f32(float %max, float 4.0) + ret float %med +} + +define 
float @v_minnum_possible_nan_rhs_input_fmed3_r_i_i_f32(float %a, float %b) #0 { +; GCN-LABEL: v_minnum_possible_nan_rhs_input_fmed3_r_i_i_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: v_max_f32_e32 v0, 2.0, v0 +; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %a.nnan.add = fdiv nnan float 1.0, %a + %known.not.snan = call float @llvm.minnum.f32(float %a.nnan.add, float %b) + %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) + %med = call float @llvm.minnum.f32(float %max, float 4.0) + ret float %med +} + +define float @v_test_known_not_snan_maxnum_input_fmed3_r_i_i_f32(float %a, float %b) #0 { +; GCN-LABEL: v_test_known_not_snan_maxnum_input_fmed3_r_i_i_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_max3_f32 v0, v0, v1, 2.0 +; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %a.nnan.add = fdiv nnan float 1.0, %a + %b.nnan.add = fadd nnan float %b, 1.0 + %known.not.snan = call float @llvm.maxnum.f32(float %a.nnan.add, float %b.nnan.add) + %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) + %med = call float @llvm.minnum.f32(float %max, float 4.0) + ret float %med +} + +define float @v_maxnum_possible_nan_lhs_input_fmed3_r_i_i_f32(float %a, float %b) #0 { +; GCN-LABEL: v_maxnum_possible_nan_lhs_input_fmed3_r_i_i_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_max3_f32 v0, v0, v1, 2.0 +; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %b.nnan.add = fadd nnan float %b, 1.0 + %known.not.snan = call float @llvm.maxnum.f32(float %a, float %b.nnan.add) + %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) + %med = call float 
@llvm.minnum.f32(float %max, float 4.0) + ret float %med +} + +define float @v_maxnum_possible_nan_rhs_input_fmed3_r_i_i_f32(float %a, float %b) #0 { +; GCN-LABEL: v_maxnum_possible_nan_rhs_input_fmed3_r_i_i_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-NEXT: v_max3_f32 v0, v0, v1, 2.0 +; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %a.nnan.add = fdiv nnan float 1.0, %a + %known.not.snan = call float @llvm.maxnum.f32(float %a.nnan.add, float %b) + %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) + %med = call float @llvm.minnum.f32(float %max, float 4.0) + ret float %med +} + +define float @v_test_known_not_snan_select_input_fmed3_r_i_i_f32(float %a, float %b, i32 %c) #0 { +; GCN-LABEL: v_test_known_not_snan_select_input_fmed3_r_i_i_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %a.nnan.add = fdiv nnan float 1.0, %a + %b.nnan.add = fadd nnan float %b, 1.0 + %cmp = icmp eq i32 %c, 0 + %known.not.snan = select i1 %cmp, float %a.nnan.add, float %b.nnan.add + %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) + %med = call float @llvm.minnum.f32(float %max, float 4.0) + ret float %med +} + +define float @v_select_possible_nan_lhs_input_fmed3_r_i_i_f32(float %a, float %b, i32 %c) #0 { +; GCN-LABEL: v_select_possible_nan_lhs_input_fmed3_r_i_i_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GCN-NEXT: v_max_f32_e32 v0, 2.0, v0 +; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %b.nnan.add = fadd nnan 
float %b, 1.0 + %cmp = icmp eq i32 %c, 0 + %known.not.snan = select i1 %cmp, float %a, float %b.nnan.add + %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) + %med = call float @llvm.minnum.f32(float %max, float 4.0) + ret float %med +} + +define float @v_select_possible_nan_rhs_input_fmed3_r_i_i_f32(float %a, float %b, i32 %c) #0 { +; GCN-LABEL: v_select_possible_nan_rhs_input_fmed3_r_i_i_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GCN-NEXT: v_max_f32_e32 v0, 2.0, v0 +; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %a.nnan.add = fdiv nnan float 1.0, %a + %cmp = icmp eq i32 %c, 0 + %known.not.snan = select i1 %cmp, float %a.nnan.add, float %b + %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) + %med = call float @llvm.minnum.f32(float %max, float 4.0) + ret float %med +} + +define float @v_test_known_not_snan_fadd_input_fmed3_r_i_i_f32(float %a, float %b) #0 { +; GCN-LABEL: v_test_known_not_snan_fadd_input_fmed3_r_i_i_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %known.not.snan = fadd float %a, %b + %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) + %med = call float @llvm.minnum.f32(float %max, float 4.0) + ret float %med +} + +define float @v_test_known_not_snan_fsub_input_fmed3_r_i_i_f32(float %a, float %b) #0 { +; GCN-LABEL: v_test_known_not_snan_fsub_input_fmed3_r_i_i_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_sub_f32_e32 v0, v0, v1 +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %known.not.snan = fsub float %a, %b + %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) + %med = call float 
@llvm.minnum.f32(float %max, float 4.0) + ret float %med +} + +define float @v_test_known_not_snan_fmul_input_fmed3_r_i_i_f32(float %a, float %b) #0 { +; GCN-LABEL: v_test_known_not_snan_fmul_input_fmed3_r_i_i_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %known.not.snan = fmul float %a, %b + %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) + %med = call float @llvm.minnum.f32(float %max, float 4.0) + ret float %med +} + +define float @v_test_known_not_snan_uint_to_fp_input_fmed3_r_i_i_f32(i32 %a) #0 { +; GCN-LABEL: v_test_known_not_snan_uint_to_fp_input_fmed3_r_i_i_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %known.not.snan = uitofp i32 %a to float + %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) + %med = call float @llvm.minnum.f32(float %max, float 4.0) + ret float %med +} + +define float @v_test_known_not_snan_sint_to_fp_input_fmed3_r_i_i_f32(i32 %a) #0 { +; GCN-LABEL: v_test_known_not_snan_sint_to_fp_input_fmed3_r_i_i_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %known.not.snan = sitofp i32 %a to float + %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) + %med = call float @llvm.minnum.f32(float %max, float 4.0) + ret float %med +} + +define float @v_test_known_not_snan_fma_input_fmed3_r_i_i_f32(float %a, float %b, float %c) #0 { +; GCN-LABEL: v_test_known_not_snan_fma_input_fmed3_r_i_i_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_fma_f32 v0, v0, v1, v2 +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: s_setpc_b64 s[30:31] + 
%known.not.snan = call float @llvm.fma.f32(float %a, float %b, float %c) + %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) + %med = call float @llvm.minnum.f32(float %max, float 4.0) + ret float %med +} + +define float @v_test_known_not_snan_fmad_input_fmed3_r_i_i_f32(float %a, float %b, float %c) #0 { +; GCN-LABEL: v_test_known_not_snan_fmad_input_fmed3_r_i_i_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mac_f32_e32 v2, v0, v1 +; GCN-NEXT: v_med3_f32 v0, v2, 2.0, 4.0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %known.not.snan = call float @llvm.fmuladd.f32(float %a, float %b, float %c) + %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) + %med = call float @llvm.minnum.f32(float %max, float 4.0) + ret float %med +} + + +define float @v_test_known_not_snan_sin_input_fmed3_r_i_i_f32(float %a) #0 { +; GCN-LABEL: v_test_known_not_snan_sin_input_fmed3_r_i_i_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 0.15915494, v0 +; GCN-NEXT: v_fract_f32_e32 v0, v0 +; GCN-NEXT: v_sin_f32_e32 v0, v0 +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %known.not.snan = call float @llvm.sin.f32(float %a) + %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) + %med = call float @llvm.minnum.f32(float %max, float 4.0) + ret float %med +} + +define float @v_test_known_not_snan_cos_input_fmed3_r_i_i_f32(float %a) #0 { +; GCN-LABEL: v_test_known_not_snan_cos_input_fmed3_r_i_i_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 0.15915494, v0 +; GCN-NEXT: v_fract_f32_e32 v0, v0 +; GCN-NEXT: v_cos_f32_e32 v0, v0 +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %known.not.snan = call float @llvm.cos.f32(float %a) + %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) + %med = call float @llvm.minnum.f32(float %max, float 
4.0) + ret float %med +} + +define float @v_test_known_not_snan_exp2_input_fmed3_r_i_i_f32(float %a) #0 { +; GCN-LABEL: v_test_known_not_snan_exp2_input_fmed3_r_i_i_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_exp_f32_e32 v0, v0 +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %known.not.snan = call float @llvm.exp2.f32(float %a) + %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) + %med = call float @llvm.minnum.f32(float %max, float 4.0) + ret float %med +} + +define float @v_test_known_not_snan_trunc_input_fmed3_r_i_i_f32(float %a) #0 { +; GCN-LABEL: v_test_known_not_snan_trunc_input_fmed3_r_i_i_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_trunc_f32_e32 v0, v0 +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %known.not.snan = call float @llvm.trunc.f32(float %a) + %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) + %med = call float @llvm.minnum.f32(float %max, float 4.0) + ret float %med +} + +define float @v_test_known_not_snan_floor_input_fmed3_r_i_i_f32(float %a) #0 { +; GCN-LABEL: v_test_known_not_snan_floor_input_fmed3_r_i_i_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_floor_f32_e32 v0, v0 +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %known.not.snan = call float @llvm.floor.f32(float %a) + %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) + %med = call float @llvm.minnum.f32(float %max, float 4.0) + ret float %med +} + +; ceil counterpart of the floor test above: the clamp is expected to fold to v_med3_f32. +define float @v_test_known_not_snan_ceil_input_fmed3_r_i_i_f32(float %a) #0 { +; GCN-LABEL: v_test_known_not_snan_ceil_input_fmed3_r_i_i_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_ceil_f32_e32 v0, v0 +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %known.not.snan = call float 
@llvm.ceil.f32(float %a) + %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) + %med = call float @llvm.minnum.f32(float %max, float 4.0) + ret float %med +} + +define float @v_test_known_not_snan_round_input_fmed3_r_i_i_f32(float %a) #0 { +; GCN-LABEL: v_test_known_not_snan_round_input_fmed3_r_i_i_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_brev_b32 s6, -2 +; GCN-NEXT: v_trunc_f32_e32 v2, v0 +; GCN-NEXT: v_bfi_b32 v1, s6, 1.0, v0 +; GCN-NEXT: v_sub_f32_e32 v0, v0, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, 0.5 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GCN-NEXT: v_add_f32_e32 v0, v2, v0 +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %known.not.snan = call float @llvm.round.f32(float %a) + %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) + %med = call float @llvm.minnum.f32(float %max, float 4.0) + ret float %med +} + +define float @v_test_known_not_snan_rint_input_fmed3_r_i_i_f32(float %a) #0 { +; GCN-LABEL: v_test_known_not_snan_rint_input_fmed3_r_i_i_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_rndne_f32_e32 v0, v0 +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %known.not.snan = call float @llvm.rint.f32(float %a) + %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) + %med = call float @llvm.minnum.f32(float %max, float 4.0) + ret float %med +} + +define float @v_test_known_not_snan_nearbyint_input_fmed3_r_i_i_f32(float %a) #0 { +; GCN-LABEL: v_test_known_not_snan_nearbyint_input_fmed3_r_i_i_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_rndne_f32_e32 v0, v0 +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %known.not.snan = call float @llvm.nearbyint.f32(float %a) + %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) + %med = call float @llvm.minnum.f32(float %max, 
float 4.0) + ret float %med +} + +define float @v_test_known_not_snan_fmul_legacy_input_fmed3_r_i_i_f32(float %a, float %b) #0 { +; GCN-LABEL: v_test_known_not_snan_fmul_legacy_input_fmed3_r_i_i_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %known.not.snan = call float @llvm.amdgcn.fmul.legacy(float %a, float %b) + %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) + %med = call float @llvm.minnum.f32(float %max, float 4.0) + ret float %med +} + +define float @v_test_known_not_snan_ldexp_input_fmed3_r_i_i_f32(float %a, i32 %b) #0 { +; GCN-LABEL: v_test_known_not_snan_ldexp_input_fmed3_r_i_i_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_ldexp_f32 v0, v0, v1 +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %known.not.snan = call float @llvm.amdgcn.ldexp.f32(float %a, i32 %b) + %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) + %med = call float @llvm.minnum.f32(float %max, float 4.0) + ret float %med +} + +define float @v_test_known_not_snan_fmed3_input_fmed3_r_i_i_f32(float %a, float %b, float %c) #0 { +; GCN-LABEL: v_test_known_not_snan_fmed3_input_fmed3_r_i_i_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_med3_f32 v0, v0, v1, v2 +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %known.not.snan = call float @llvm.amdgcn.fmed3.f32(float %a, float %b, float %c) + %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) + %med = call float @llvm.minnum.f32(float %max, float 4.0) + ret float %med +} + +define float @v_test_known_not_snan_fmin3_input_fmed3_r_i_i_f32(float %a, float %b, float %c) #0 { +; GCN-LABEL: v_test_known_not_snan_fmin3_input_fmed3_r_i_i_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GCN-NEXT: v_min3_f32 v0, v0, v1, v2 +; GCN-NEXT: v_max_f32_e32 v0, 2.0, v0 +; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %min0 = call float @llvm.minnum.f32(float %a, float %b) + %known.not.snan = call float @llvm.minnum.f32(float %min0, float %c) + %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) + %med = call float @llvm.minnum.f32(float %max, float 4.0) + ret float %med +} + +define float @v_test_known_not_snan_cvt_ubyte0_input_fmed3_r_i_i_f32(i8 %char) #0 { +; GCN-LABEL: v_test_known_not_snan_cvt_ubyte0_input_fmed3_r_i_i_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %cvt = uitofp i8 %char to float + %max = call float @llvm.maxnum.f32(float %cvt, float 2.0) + %med = call float @llvm.minnum.f32(float %max, float 4.0) + ret float %med +} + +define float @v_test_not_known_frexp_mant_input_fmed3_r_i_i_f32(float %arg) #0 { +; GCN-LABEL: v_test_not_known_frexp_mant_input_fmed3_r_i_i_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GCN-NEXT: v_max_f32_e32 v0, 2.0, v0 +; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %known.not.snan = call float @llvm.amdgcn.frexp.mant.f32(float %arg) + %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) + %med = call float @llvm.minnum.f32(float %max, float 4.0) + ret float %med +} + +define float @v_test_known_not_frexp_mant_input_fmed3_r_i_i_f32(float %arg) #0 { +; GCN-LABEL: v_test_known_not_frexp_mant_input_fmed3_r_i_i_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %add = fadd float %arg, 1.0 + %known.not.snan = call float 
@llvm.amdgcn.frexp.mant.f32(float %add) + %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) + %med = call float @llvm.minnum.f32(float %max, float 4.0) + ret float %med +} + +define float @v_test_known_not_snan_cubeid_input_fmed3_r_i_i_f32(float %a, float %b, float %c) #0 { +; GCN-LABEL: v_test_known_not_snan_cubeid_input_fmed3_r_i_i_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cubeid_f32 v0, v0, v1, v2 +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %known.not.snan = call float @llvm.amdgcn.cubeid(float %a, float %b, float %c) + %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) + %med = call float @llvm.minnum.f32(float %max, float 4.0) + ret float %med +} + +declare float @llvm.fabs.f32(float) #1 +declare float @llvm.sin.f32(float) #1 +declare float @llvm.cos.f32(float) #1 +declare float @llvm.exp2.f32(float) #1 +declare float @llvm.trunc.f32(float) #1 +declare float @llvm.floor.f32(float) #1 +declare float @llvm.ceil.f32(float) #1 +declare float @llvm.round.f32(float) #1 +declare float @llvm.rint.f32(float) #1 +declare float @llvm.nearbyint.f32(float) #1 +declare float @llvm.canonicalize.f32(float) #1 +declare float @llvm.minnum.f32(float, float) #1 +declare float @llvm.maxnum.f32(float, float) #1 +declare float @llvm.copysign.f32(float, float) #1 +declare float @llvm.fma.f32(float, float, float) #1 +declare float @llvm.fmuladd.f32(float, float, float) #1 +declare float @llvm.amdgcn.ldexp.f32(float, i32) #1 +declare float @llvm.amdgcn.fmul.legacy(float, float) #1 +declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1 +declare float @llvm.amdgcn.frexp.mant.f32(float) #1 +declare float @llvm.amdgcn.cubeid(float, float, float) #0 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone speculatable }