Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -310,6 +310,13 @@
 
   bool isSDNodeSourceOfDivergence(const SDNode *N,
     FunctionLoweringInfo *FLI, DivergenceAnalysis *DA) const override;
+
+  /// Return true if \p Op is known never to be a signaling NaN. \p Depth is
+  /// the current recursion depth.
+  bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op,
+                        unsigned Depth = 0) const;
+
+  bool denormalsEnabledForType(EVT VT) const;
 };
 
 } // End namespace llvm
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6717,11 +6717,93 @@
   return AMDGPUTargetLowering::performRcpCombine(N, DCI);
 }
 
-static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
-  if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions())
+/// Return true if \p Op is known never to be a signaling NaN.
+///
+/// The query recurses through operands, passing \p Depth + 1 at each step,
+/// and conservatively answers false once \p Depth reaches 6.
+bool SITargetLowering::isKnownNeverSNan(SelectionDAG &DAG, SDValue Op,
+                                        unsigned Depth) const {
+  if (Depth >= 6)
+    return false;
+
+  switch (Op.getOpcode()) {
+  case ISD::ConstantFP: {
+    ConstantFPSDNode *C = cast<ConstantFPSDNode>(Op);
+    return !C->getValueAPF().isNaN() ||
+           !C->getValueAPF().isSignaling();
+  }
+  case ISD::FMINNUM:
+  case ISD::FMAXNUM: {
+    // Known outright in IEEE mode; otherwise both inputs must be known.
+    bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
+    return IsIEEEMode || (isKnownNeverSNan(DAG, Op.getOperand(0), Depth + 1) &&
+                          isKnownNeverSNan(DAG, Op.getOperand(1), Depth + 1));
+  }
+  // Unconditionally treated as never yielding a signaling NaN.
+  case ISD::FADD:
+  case ISD::FSUB:
+  case ISD::FMUL:
+  case ISD::FMAD:
+  case ISD::FCANONICALIZE:
+  case AMDGPUISD::FMED3:
+  case AMDGPUISD::FMIN3:
+  case AMDGPUISD::FMAX3:
+  case AMDGPUISD::FMIN_LEGACY:
+  case AMDGPUISD::FMAX_LEGACY:
+  case AMDGPUISD::CLAMP:
     return true;
+  case ISD::FDIV:
+  case ISD::FREM:
+  case ISD::FMA:
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP:
+  case ISD::FSQRT:
+  case ISD::FSIN:
+  case ISD::FCOS:
+  case ISD::FPOWI:
+  case ISD::FPOW:
+  case ISD::FLOG:
+  case ISD::FLOG2:
+  case ISD::FLOG10:
+  case ISD::FEXP:
+  case ISD::FEXP2:
+  case ISD::FCEIL:
+  case ISD::FTRUNC:
+  case ISD::FRINT:
+  case ISD::FNEARBYINT:
+  case ISD::FROUND:
+  case ISD::FFLOOR:
+  case AMDGPUISD::RCP:
+  case AMDGPUISD::RSQ:
+  case AMDGPUISD::RSQ_CLAMP:
+  case AMDGPUISD::CVT_F32_UBYTE0:
+  case AMDGPUISD::CVT_F32_UBYTE1:
+  case AMDGPUISD::CVT_F32_UBYTE2:
+  case AMDGPUISD::CVT_F32_UBYTE3:
+    // TODO: This could be refined based on operands.
+    return !DAG.getTargetLoweringInfo().hasFloatingPointExceptions() ||
+           Op->getFlags().hasNoNaNs() ||
+           DAG.getTarget().Options.NoNaNsFPMath;
+  // Only the first operand is inspected for these.
+  case ISD::FNEG:
+  case ISD::FABS:
+  case ISD::FCOPYSIGN:
+  case ISD::FP_EXTEND:
+  case AMDGPUISD::FP16_ZEXT:
+  case AMDGPUISD::FP_TO_FP16:
+  case AMDGPUISD::CVT_PKRTZ_F16_F32:
+    return isKnownNeverSNan(DAG, Op.getOperand(0), Depth + 1);
+
+  case ISD::SELECT:
+    return isKnownNeverSNan(DAG, Op.getOperand(1), Depth + 1) &&
+           isKnownNeverSNan(DAG, Op.getOperand(2), Depth + 1);
 
-  return DAG.isKnownNeverNaN(Op);
+  case ISD::FMAXNAN:
+  case ISD::FMINNAN:
+    // TODO: What do these do for snans?
+  default:
+    return false;
+  }
 }
 
 static bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
Index: test/CodeGen/AMDGPU/clamp.ll
===================================================================
--- test/CodeGen/AMDGPU/clamp.ll
+++ test/CodeGen/AMDGPU/clamp.ll
@@ -54,13 +54,15 @@
 ; GCN-LABEL: {{^}}v_clamp_negzero_f32:
 ; GCN-DAG: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
 ; GCN-DAG: v_bfrev_b32_e32 [[SIGNBIT:v[0-9]+]], 1
-; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[SIGNBIT]], 1.0
+; GCN-DAG: v_add_f32_e32 [[QUIET:v[0-9]+]], 0, [[A]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[QUIET]], [[SIGNBIT]], 1.0
 define amdgpu_kernel void @v_clamp_negzero_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
   %a = load float, float addrspace(1)* %gep0
-  %max = call float @llvm.maxnum.f32(float %a, float -0.0)
+  %quiet = fadd float %a, 0.0
+  %max = call float @llvm.maxnum.f32(float %quiet, float -0.0)
   %med = call float @llvm.minnum.f32(float %max, float 1.0)
 
   store float %med, float addrspace(1)* %out.gep
@@ -352,13 +354,15 @@
 
 ; GCN-LABEL: {{^}}v_clamp_f32_no_dx10_clamp:
 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
-; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0
+; GCN: v_add_f32_e32 [[QUIET:v[0-9]+]], 0, [[A]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[QUIET]], 0, 1.0
 define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
   %a = load float, float addrspace(1)* %gep0
-  %max = call float @llvm.maxnum.f32(float %a, float 0.0)
+  %quiet = fadd float %a, 0.0
+  %max = call float @llvm.maxnum.f32(float %quiet, float 0.0)
   %med = call float @llvm.minnum.f32(float %max, float 1.0)
 
   store float %med, float addrspace(1)* %out.gep
Index: test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
===================================================================
--- test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
+++ test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
@@ -457,11 +457,11 @@
   ret void
 }
 
-; GCN-LABEL: test_no_fold_canonicalize_fmul_value_f32_no_ieee:
+; GCN-LABEL: test_no_fold_canonicalize_fdiv_value_f32_no_ieee:
 ; GCN-EXCEPT: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
-define amdgpu_ps float @test_no_fold_canonicalize_fmul_value_f32_no_ieee(float %arg) {
+define amdgpu_ps float @test_no_fold_canonicalize_fdiv_value_f32_no_ieee(float %arg0) {
 entry:
-  %v = fmul float %arg, 15.0
+  %v = fdiv float %arg0, 15.0
   %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
   ret float %canonicalized
 }
Index: test/CodeGen/AMDGPU/fmed3.ll
===================================================================
--- test/CodeGen/AMDGPU/fmed3.ll
+++ test/CodeGen/AMDGPU/fmed3.ll
@@ -32,8 +32,9 @@
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
   %a = load float, float addrspace(1)* %gep0
+  %may.be.snan = call float @llvm.sqrt.f32(float %a)
 
-  %max = call float @llvm.maxnum.f32(float %a, float 2.0)
+  %max = call float @llvm.maxnum.f32(float %may.be.snan, float 2.0)
   %med = call float @llvm.minnum.f32(float %max, float 4.0)
 
   store float %med, float addrspace(1)* %outgep
@@ -50,8 +51,8 @@
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
   %a = load float, float addrspace(1)* %gep0
-
-  %max = call float @llvm.maxnum.f32(float 2.0, float %a)
+  %may.be.snan = call float @llvm.sqrt.f32(float %a)
+  %max = call float @llvm.maxnum.f32(float 2.0, float %may.be.snan)
   %med = call float @llvm.minnum.f32(float 4.0, float %max)
 
   store float %med, float addrspace(1)* %outgep
@@ -68,8 +69,9 @@
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
   %a = load float, float addrspace(1)* %gep0
+  %may.be.snan = call float @llvm.sqrt.f32(float %a)
 
-  %max = call float @llvm.maxnum.f32(float %a, float 2.0)
+  %max = call float @llvm.maxnum.f32(float %may.be.snan, float 2.0)
   %med = call float @llvm.minnum.f32(float 4.0, float %max)
 
   store float %med, float addrspace(1)* %outgep
@@ -133,8 +135,9 @@
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
   %a = load float, float addrspace(1)* %gep0
+  %may.be.snan = call float @llvm.sqrt.f32(float %a)
 
-  %max = call float @llvm.maxnum.f32(float %a, float 2.0)
+  %max = call float @llvm.maxnum.f32(float %may.be.snan, float 2.0)
   %med = call float @llvm.minnum.f32(float %max, float 4.0)
 
   store float %med, float addrspace(1)* %outgep
@@ -151,10 +154,11 @@
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
   %a = load float, float addrspace(1)* %gep0
+  %may.be.snan = call float @llvm.sqrt.f32(float %a)
 
   ; fmax_legacy
-  %cmp0 = fcmp ule float %a, 2.0
-  %max = select i1 %cmp0, float 2.0, float %a
+  %cmp0 = fcmp ule float %may.be.snan, 2.0
+  %max = select i1 %cmp0, float 2.0, float %may.be.snan
 
   ; fmin_legacy
   %cmp1 = fcmp uge float %max, 4.0
@@ -962,6 +966,7 @@
 
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 declare float @llvm.fabs.f32(float) #0
+declare float @llvm.sqrt.f32(float) #0
 declare float @llvm.minnum.f32(float, float) #0
 declare float @llvm.maxnum.f32(float, float) #0
 declare double @llvm.minnum.f64(double, double) #0