Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -9772,6 +9772,16 @@ const SDLoc &SL, SDValue Op0, SDValue Op1) const { + const MachineFunction &MF = DAG.getMachineFunction(); + const SIMachineFunctionInfo *Info = MF.getInfo(); + unsigned Opc = Op0.getOpcode(); + bool IEEE = Info->getMode().IEEE; + // Based on IEEE setting false/true, v_max_f instruction behaves like + // ISD::FMAXNUM/ISD::FMAXNUM_IEEE respectively. Skip complicated checks when + // IEEE setting and opcode don't match. Retry after legalization when they do. + if ((Opc == ISD::FMAXNUM && IEEE) || (Opc == ISD::FMAXNUM_IEEE && !IEEE)) + return SDValue(); + ConstantFPSDNode *K1 = getSplatConstantFP(Op1); if (!K1) return SDValue(); @@ -9784,29 +9794,31 @@ if (K0->getValueAPF() > K1->getValueAPF()) return SDValue(); - const MachineFunction &MF = DAG.getMachineFunction(); - const SIMachineFunctionInfo *Info = MF.getInfo(); - - // TODO: Check IEEE bit enabled? + // Folding min(max(Val, 0.0), 1.0) into clamp(Val) is safe for non-NaN input. + // FMAXNUM_IEEE(SNaN, 0.0) = QNaN; FMINNUM_IEEE(QNaN, 1.0) = 1.0. + // FMAXNUM(NaN, 0.0) = FMAXNUM_IEEE(QNaN, 0.0) = 0.0 (returns non-NaN input) + // FMAX_LEGACY(NaN, 0.0) -> NaN >= 0.0 ? NaN : 0.0 = 0.0; min(0.0, 1.0) = 0.0. + // For the source, check for SNaN input to FMAXNUM_IEEE since only it doesn't + // evaluate to 0.0. For the destination we want to clamp NaNs to 0.0. + // When inner node is FMAXNUM_IEEE check if its result is known non-SNaN. This + // check for no-NaN flags first and then if input(Val) is known non-SNaN. EVT VT = Op0.getValueType(); + SDValue Val = Op0.getOperand(0); if (Info->getMode().DX10Clamp) { // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the // hardware fmed3 behavior converting to a min. // FIXME: Should this be allowing -0.0? 
if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0)) - return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0)); + if (Opc != ISD::FMAXNUM_IEEE || DAG.isKnownNeverSNaN(Op0)) + return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Val, Op0->getFlags()); } // med3 for f16 is only available on gfx9+, and not available for v2f16. if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) { - // This isn't safe with signaling NaNs because in IEEE mode, min/max on a - // signaling NaN gives a quiet NaN. The quiet NaN input to the min would - // then give the other result, which is different from med3 with a NaN - // input. - SDValue Var = Op0.getOperand(0); - if (!DAG.isKnownNeverSNaN(Var)) - return SDValue(); - + // Folding min(max(Val, K0), K1) into fmed3(Val, K0, K1) is safe for non-NaN + // input. fmed3(NaN, K0, K1) is equivalent to min(min(NaN, K0), K1), since + // inner nodes(max/min) have same behavior for 'NaN input as first operand' + // this is safe to fold for all inputs. const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); if ((!K0->hasOneUse() || @@ -9814,7 +9826,7 @@ (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF().bitcastToAPInt()))) { return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), - Var, SDValue(K0, 0), SDValue(K1, 0)); + Val, SDValue(K0, 0), SDValue(K1, 0), Op0->getFlags()); } } @@ -9890,15 +9902,9 @@ return SDValue(); } -static bool isClampZeroToOne(SDValue A, SDValue B) { - if (ConstantFPSDNode *CA = dyn_cast(A)) { - if (ConstantFPSDNode *CB = dyn_cast(B)) { - // FIXME: Should this be allowing -0.0? 
- return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) || - (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0)); - } - } - +static bool isOperandExactlyValue(SDNode *N, unsigned Idx, double Value) { + if (ConstantFPSDNode *C = dyn_cast(N->getOperand(Idx))) + return C->isExactlyValue(Value); return false; } @@ -9907,42 +9913,53 @@ DAGCombinerInfo &DCI) const { EVT VT = N->getValueType(0); // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and - // NaNs. With a NaN input, the order of the operands may change the result. + // NaNs. With a SNaN input in IEEE mode, the order of the operands may change + // the result because then fmed3(a, b, c) is equivalent to min(min(a, b), c). SelectionDAG &DAG = DCI.DAG; SDLoc SL(N); - SDValue Src0 = N->getOperand(0); - SDValue Src1 = N->getOperand(1); - SDValue Src2 = N->getOperand(2); - - if (isClampZeroToOne(Src0, Src1)) { - // const_a, const_b, x -> clamp is safe in all cases including signaling - // nans. - // FIXME: Should this be allowing -0.0? - return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2); + // Two, out of the three, operands need to be 0.0 and 1.0. + unsigned ValIdx = 0, ZeroIdx = 1, OneIdx = 2; // Initial guess. + // Find index of the operand with 0.0. + if (!isOperandExactlyValue(N, ZeroIdx, 0.0)) { + std::swap(ZeroIdx, ValIdx); + if (!isOperandExactlyValue(N, ZeroIdx, 0.0)) { + std::swap(ZeroIdx, OneIdx); + if (!isOperandExactlyValue(N, ZeroIdx, 0.0)) + return SDValue(); + } + } + // Find index of the operand with 1.0, remaining index is Val. + if (!isOperandExactlyValue(N, OneIdx, 1.0)) { + std::swap(OneIdx, ValIdx); + if (!isOperandExactlyValue(N, OneIdx, 1.0)) + return SDValue(); } + SDValue Val = N->getOperand(ValIdx); + // If we're told that NaNs won't happen assume that it is safe to clamp. + if (N->getFlags().hasNoNaNs() || DAG.getTarget().Options.NoNaNsFPMath) + return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Val, N->getFlags()); + + // Folding fmed3(Val, 0.0, 1.0) into clamp(Val). 
Consider all 6 operand + // permutations for fmed3. It is safe to clamp when Val is not NaN. + // For NaN input we consider result of min(min(a, b), c) based on min wrt IEEE + // mode and clamp(Val) wrt DX10Clamp. K is 0.0 or 1.0 below. Result depends on + // the result of the inner min and value of the last operand(named c above). + // SNaN input and IEEE=true: min(SNaN, K) -> QNaN, min(QNaN, K) -> K + // fmed3 returns either QNaN, 1.0 or 0.0(when 0.0 is last operand(c)). + // QNaN input, or SNaN input and IEEE=false: min(NaN, K) -> K + // min with NaN will return the other operand thus fmed3 returns 0.0. + // It is not safe to fold with DX10Clamp=false since that lets NaN through and + // never matches fmed3 result. DX10Clamp=true clamps NaN to 0.0 and is safe to + // fold when fmed3 returns 0.0. const MachineFunction &MF = DAG.getMachineFunction(); const SIMachineFunctionInfo *Info = MF.getInfo(); - - // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother - // handling no dx10-clamp? - if (Info->getMode().DX10Clamp) { - // If NaNs is clamped to 0, we are free to reorder the inputs. 
- - if (isa(Src0) && !isa(Src1)) - std::swap(Src0, Src1); - - if (isa(Src1) && !isa(Src2)) - std::swap(Src1, Src2); - - if (isa(Src0) && !isa(Src1)) - std::swap(Src0, Src1); - - if (isClampZeroToOne(Src1, Src2)) - return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0); - } + if ((Info->getMode().DX10Clamp && + (!Info->getMode().IEEE || ZeroIdx == 2 || DAG.isKnownNeverSNaN(Val))) || + DAG.isKnownNeverNaN(Val)) + return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Val, N->getFlags()); return SDValue(); } Index: llvm/test/CodeGen/AMDGPU/clamp-modifier.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/clamp-modifier.ll +++ llvm/test/CodeGen/AMDGPU/clamp-modifier.ll @@ -359,8 +359,10 @@ ; VI: v_or_b32 ; SI: v_cvt_pkrtz_f16_f32_e32 v0, v0, v1{{$}} -; SI-DAG: v_cvt_f32_f16_e64 v0, v0 clamp -; SI-DAG: v_cvt_f32_f16_e64 v1, v1 clamp +; SI-DAG: v_cvt_f32_f16_e32 v0, v0 +; SI-DAG: v_cvt_f32_f16_e32 v1, v1 +; SI-DAG: v_mul_f32_e64 v1, 1.0, v1 clamp +; SI-DAG: v_mul_f32_e64 v0, 1.0, v0 clamp define <2 x half> @v_clamp_cvt_pkrtz_src_v2f16_denorm(float %a, float %b) #0 { %add = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %b) %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %add, <2 x half> zeroinitializer) Index: llvm/test/CodeGen/AMDGPU/clamp.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/clamp.ll +++ llvm/test/CodeGen/AMDGPU/clamp.ll @@ -4,7 +4,8 @@ ; GCN-LABEL: {{^}}v_clamp_f32: ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} +; GFX678: v_mul_f32_e64 v{{[0-9]+}}, 1.0, [[A]] clamp{{$}} +; GFX9: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} define amdgpu_kernel void @v_clamp_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid @@ -19,7 +20,8 @@ ; GCN-LABEL: {{^}}v_clamp_neg_f32: ; GCN: 
{{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: v_max_f32_e64 v{{[0-9]+}}, -[[A]], -[[A]] clamp{{$}} +; GFX678: v_mul_f32_e64 v{{[0-9]+}}, -1.0, [[A]] clamp{{$}} +; GFX9: v_max_f32_e64 v{{[0-9]+}}, -[[A]], -[[A]] clamp{{$}} define amdgpu_kernel void @v_clamp_neg_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid @@ -35,7 +37,8 @@ ; GCN-LABEL: {{^}}v_clamp_negabs_f32: ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: v_max_f32_e64 v{{[0-9]+}}, -|[[A]]|, -|[[A]]| clamp{{$}} +; GFX678: v_mul_f32_e64 v{{[0-9]+}}, -1.0, |[[A]]| clamp{{$}} +; GFX9: v_max_f32_e64 v{{[0-9]+}}, -|[[A]]|, -|[[A]]| clamp{{$}} define amdgpu_kernel void @v_clamp_negabs_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid @@ -121,8 +124,9 @@ ; GCN: {{buffer|flat|global}}_load_ushort [[A:v[0-9]+]] ; GFX89: v_max_f16_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} -; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], [[A]] clamp{{$}} -; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]] +; SI: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[A]] +; SI: v_mul_f32_e64 [[FCANON:v[0-9]+]], 1.0, [[CVT]] clamp{{$}} +; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[FCANON]] define amdgpu_kernel void @v_clamp_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid @@ -140,8 +144,9 @@ ; GFX89: v_max_f16_e64 v{{[0-9]+}}, -[[A]], -[[A]] clamp{{$}} ; FIXME: Better to fold neg into max -; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]] clamp{{$}} -; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]] +; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]] +; SI: v_mul_f32_e64 [[FCANON:v[0-9]+]], 1.0, [[CVT]] clamp{{$}} +; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[FCANON]] define amdgpu_kernel void 
@v_clamp_neg_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid @@ -161,8 +166,9 @@ ; FIXME: Better to fold neg/abs into max -; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -|[[A]]| clamp{{$}} -; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]] +; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -|[[A]]| +; SI: v_mul_f32_e64 [[FCANON:v[0-9]+]], 1.0, [[CVT]] clamp{{$}} +; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[FCANON]] define amdgpu_kernel void @v_clamp_negabs_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid @@ -243,7 +249,7 @@ ; GCN-LABEL: {{^}}v_clamp_med3_aby_f32: ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} +; GCN: v_med3_f32 v{{[0-9]+}}, 0, 1.0, [[A]] define amdgpu_kernel void @v_clamp_med3_aby_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid @@ -256,7 +262,7 @@ ; GCN-LABEL: {{^}}v_clamp_med3_bay_f32: ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} +; GCN: v_med3_f32 v{{[0-9]+}}, 1.0, 0, [[A]] define amdgpu_kernel void @v_clamp_med3_bay_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid @@ -269,7 +275,7 @@ ; GCN-LABEL: {{^}}v_clamp_med3_yab_f32: ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} +; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0 define amdgpu_kernel void @v_clamp_med3_yab_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, 
float addrspace(1)* %aptr, i32 %tid @@ -295,7 +301,7 @@ ; GCN-LABEL: {{^}}v_clamp_med3_ayb_f32: ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} +; GCN: v_med3_f32 v{{[0-9]+}}, 0, [[A]], 1.0 define amdgpu_kernel void @v_clamp_med3_ayb_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid @@ -319,6 +325,45 @@ ret void } +; GCN-LABEL: {{^}}v_clamp_med3_ayb_f32_snan_no_ieee: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} +define amdgpu_kernel void @v_clamp_med3_ayb_f32_snan_no_ieee(float addrspace(1)* %out, float addrspace(1)* %aptr) #5 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0) + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_nnan_med3_ayb_f32: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN: v_med3_f32 v{{[0-9]+}}, 0, [[A]], 1.0 +define amdgpu_kernel void @v_clamp_nnan_med3_ayb_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %med = call nnan float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0) + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_med3_ayb_f32_no_nans_fp_math: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} +define amdgpu_kernel void 
@v_clamp_med3_ayb_f32_no_nans_fp_math(float addrspace(1)* %out, float addrspace(1)* %aptr) #6 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0) + store float %med, float addrspace(1)* %out.gep + ret void +} + ; GCN-LABEL: {{^}}v_clamp_constants_to_one_f32: ; GCN: v_mov_b32_e32 v{{[0-9]+}}, 1.0 define amdgpu_kernel void @v_clamp_constants_to_one_f32(float addrspace(1)* %out) #0 { @@ -370,7 +415,8 @@ } ; GCN-LABEL: {{^}}v_clamp_constant_snan_f32: -; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} +; GCN: v_mov_b32_e32 [[A:v[0-9]+]], 0x7f800001{{$}} +; GCN: v_med3_f32 v{{[0-9]+}}, 0, 1.0, [[A]] define amdgpu_kernel void @v_clamp_constant_snan_f32(float addrspace(1)* %out) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -385,8 +431,7 @@ ; GCN-LABEL: {{^}}v_clamp_f32_no_dx10_clamp: ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 0.5, [[A]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 0, 1.0 +; GCN: v_add_f32_e64 [[ADD:v[0-9]+]], [[A]], 0.5 clamp{{$}} define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid @@ -418,9 +463,7 @@ ; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp: ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GFX678: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]] -; GFX9: v_max_f32_e32 [[QUIET_A:v[0-9]+]], [[A]], [[A]] -; GCN: v_med3_f32 {{v[0-9]+}}, [[QUIET_A]], 0, 1.0 +; GCN: v_med3_f32 {{v[0-9]+}}, [[A]], 0, 1.0 define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 { %tid = 
call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid @@ -435,8 +478,7 @@ ; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp_nnan_src: ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 0, 1.0 +; GCN: v_add_f32_e64 [[ADD:v[0-9]+]], [[A]], 1.0 clamp{{$}} define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid @@ -452,7 +494,7 @@ ; GCN-LABEL: {{^}}v_clamp_med3_aby_f32_no_dx10_clamp: ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} +; GCN: v_med3_f32 v{{[0-9]+}}, 0, 1.0, [[A]] define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid @@ -465,7 +507,7 @@ ; GCN-LABEL: {{^}}v_clamp_med3_bay_f32_no_dx10_clamp: ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} +; GCN: v_med3_f32 v{{[0-9]+}}, 1.0, 0, [[A]] define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid @@ -736,7 +778,8 @@ ; GCN-LABEL: {{^}}v_clamp_diff_source_f32: ; GCN: v_add_f32_e32 [[A:v[0-9]+]] ; GCN: v_add_f32_e32 [[B:v[0-9]+]] -; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[B]] clamp{{$}} +; GCN: v_max3_f32 [[MAX3:v[0-9]+]], [[A]], [[B]], 0 +; GCN: v_min_f32_e32 v{{[0-9]+}}, 1.0, [[MAX3]] define amdgpu_kernel void @v_clamp_diff_source_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { %gep0 = getelementptr float, float 
addrspace(1)* %aptr, i32 0 @@ -775,3 +818,5 @@ attributes #2 = { nounwind "amdgpu-dx10-clamp"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-nans-fp-math"="false" } attributes #3 = { nounwind "amdgpu-dx10-clamp"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-nans-fp-math"="false" } attributes #4 = { nounwind "amdgpu-dx10-clamp"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-nans-fp-math"="false" } +attributes #5 = { nounwind "amdgpu-dx10-clamp"="true" "amdgpu-ieee"="false" } +attributes #6 = { nounwind "no-nans-fp-math"="true" } Index: llvm/test/CodeGen/AMDGPU/known-never-snan.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/known-never-snan.ll +++ llvm/test/CodeGen/AMDGPU/known-never-snan.ll @@ -179,8 +179,8 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_max_f32_e32 v0, v0, v1 -; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: v_max3_f32 v0, v0, v1, 2.0 +; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] %a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0 %b.nnan.add = fadd nnan float %b, 1.0 @@ -195,9 +195,8 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_max_f32_e32 v0, v0, v1 -; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: v_max3_f32 v0, v0, v1, 2.0 +; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] %b.nnan.add = fadd nnan float %b, 1.0 %known.not.snan = call float @llvm.maxnum.f32(float %a, float %b.nnan.add) @@ -211,9 +210,8 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_max_f32_e32 v0, v0, v1 -; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: v_max3_f32 v0, v0, v1, 2.0 +; GCN-NEXT: 
v_min_f32_e32 v0, 4.0, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] %a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0 %known.not.snan = call float @llvm.maxnum.f32(float %a.nnan.add, float %b)