Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -9756,25 +9756,31 @@ const MachineFunction &MF = DAG.getMachineFunction(); const SIMachineFunctionInfo *Info = MF.getInfo(); - // TODO: Check IEEE bit enabled? + SDValue Var = Op0.getOperand(0); EVT VT = Op0.getValueType(); - if (Info->getMode().DX10Clamp) { - // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the - // hardware fmed3 behavior converting to a min. - // FIXME: Should this be allowing -0.0? - if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0)) - return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0)); + // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the + // hardware fmed3 behavior converting to a min. + // FIXME: Should this be allowing -0.0? + if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0)) { + // If we're told NaNs won't happen on inner node make clamp and keep flags. + if (Op0->getFlags().hasNoNaNs() || DAG.getTarget().Options.NoNaNsFPMath) + return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Var, Op0->getFlags()); + + // When inner node returns QNaN (Var=SNaN and IEEE=true) outer node becomes + // min(QNaN, 1.0) = 1.0 and we can't use clamp. + bool XKnownNeverSNaN = DAG.isKnownNeverSNaN(Var); + if (DAG.isKnownNeverNaN(Var) || + (Info->getMode().DX10Clamp && + (XKnownNeverSNaN || (!XKnownNeverSNaN && !Info->getMode().IEEE)))) + return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Var); } // med3 for f16 is only available on gfx9+, and not available for v2f16. if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) { - // This isn't safe with signaling NaNs because in IEEE mode, min/max on a - // signaling NaN gives a quiet NaN. The quiet NaN input to the min would - // then give the other result, which is different from med3 with a NaN - // input. 
- SDValue Var = Op0.getOperand(0); - if (!DAG.isKnownNeverSNaN(Var)) - return SDValue(); + // min/max on a signaling NaN gives a quiet NaN. The quiet NaN input to the + // min/max would then give the other result. NaN input to med3(x, K0, K1) is + // equivalent to min(min(x, K0), K1). Since min and max (inner node) have + // same behaviour with NaN input, min(max(x, K0), K1) is equivalent to med3. const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); @@ -9859,15 +9865,9 @@ return SDValue(); } -static bool isClampZeroToOne(SDValue A, SDValue B) { - if (ConstantFPSDNode *CA = dyn_cast(A)) { - if (ConstantFPSDNode *CB = dyn_cast(B)) { - // FIXME: Should this be allowing -0.0? - return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) || - (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0)); - } - } - +static bool isOperandExactlyValue(SDNode *N, unsigned Idx, double Value) { + if (ConstantFPSDNode *C = dyn_cast(N->getOperand(Idx))) + return C->isExactlyValue(Value); return false; } @@ -9876,42 +9876,56 @@ DAGCombinerInfo &DCI) const { EVT VT = N->getValueType(0); // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and - // NaNs. With a NaN input, the order of the operands may change the result. + // NaNs. med3_f32(a, b, c) is equivalent to min_f32(min_f32(a,b), c) when one + // of the a, b or c is NaN. Result is sensitive to the order of inputs in IEEE + // mode when one of them is SNaN. SelectionDAG &DAG = DCI.DAG; SDLoc SL(N); - SDValue Src0 = N->getOperand(0); - SDValue Src1 = N->getOperand(1); - SDValue Src2 = N->getOperand(2); - - if (isClampZeroToOne(Src0, Src1)) { - // const_a, const_b, x -> clamp is safe in all cases including signaling - // nans. - // FIXME: Should this be allowing -0.0? - return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2); + // Two, out of the three, operands need to be 0.0 and 1.0. + unsigned XIdx = 0, ZeroIdx = 1, OneIdx = 2; // Initial guess. + // Find index of the operand with 0.0. 
+ if (!isOperandExactlyValue(N, ZeroIdx, 0.0)) { + std::swap(ZeroIdx, XIdx); + if (!isOperandExactlyValue(N, ZeroIdx, 0.0)) { + std::swap(ZeroIdx, OneIdx); + if (!isOperandExactlyValue(N, ZeroIdx, 0.0)) + return SDValue(); + } + } + // Find index of the operand with 1.0, remaining index is X. + if (!isOperandExactlyValue(N, OneIdx, 1.0)) { + std::swap(OneIdx, XIdx); + if (!isOperandExactlyValue(N, OneIdx, 1.0)) + return SDValue(); } - const MachineFunction &MF = DAG.getMachineFunction(); - const SIMachineFunctionInfo *Info = MF.getInfo(); - - // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother - // handling no dx10-clamp? - if (Info->getMode().DX10Clamp) { - // If NaNs is clamped to 0, we are free to reorder the inputs. - - if (isa(Src0) && !isa(Src1)) - std::swap(Src0, Src1); - - if (isa(Src1) && !isa(Src2)) - std::swap(Src1, Src2); + SDValue X = N->getOperand(XIdx); + // If we're told that NaNs won't happen, make clamp and keep flags. + if (N->getFlags().hasNoNaNs() || DAG.getTarget().Options.NoNaNsFPMath) + return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, X, N->getFlags()); - if (isa(Src0) && !isa(Src1)) - std::swap(Src0, Src1); + // It is safe to clamp when X is not NaN. + // Based on v_min_f32 behaviour, IEEE and DX10Clamp settings: + // X=SNaN and IEEE=true: min(SNaN, Val) -> QNaN, min(QNaN, Val) -> Val + // med3 returns either QNaN, 1.0 or 0.0 depending on the inputs. + // DX10Clamp=true: clamps SNaN to 0.0, ok when med3's operand(2) is 0.0. + // DX10Clamp=false: passes SNaN through, can't clamp. + // X=QNaN or (X=SNaN and IEEE=false): min(NaN, Val) -> Val + // min with NaN will return other operand, thus med3 returns 0.0. + // DX10Clamp=true: clamps NaN to 0.0, ok. + // DX10Clamp=false: passes NaN through, can't clamp. 
- if (isClampZeroToOne(Src1, Src2)) - return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0); - } + const MachineFunction &MF = DAG.getMachineFunction(); + const SIMachineFunctionInfo *Info = MF.getInfo(); + bool XKnownNeverSNaN = DAG.isKnownNeverSNaN(X); + bool IEEE = Info->getMode().IEEE; + if (DAG.isKnownNeverNaN(X) || isa(X) || + (Info->getMode().DX10Clamp && + ((!XKnownNeverSNaN && IEEE && ZeroIdx == 2) || XKnownNeverSNaN || + (!XKnownNeverSNaN && !IEEE)))) + return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, X); return SDValue(); } Index: llvm/test/CodeGen/AMDGPU/clamp-modifier.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/clamp-modifier.ll +++ llvm/test/CodeGen/AMDGPU/clamp-modifier.ll @@ -236,8 +236,9 @@ ; GCN-LABEL: {{^}}v_clamp_add_src_v2f16_denorm_neg_lo: ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GFX9: v_pk_add_f16 [[ADD:v[0-9]+]], [[A]], 1.0 op_sel_hi:[1,0]{{$}} -; GFX9: v_pk_max_f16 [[MAX:v[0-9]+]], [[ADD]], [[ADD]] neg_lo:[1,1] clamp{{$}} +; GFX9: v_pk_add_f16 [[A]], [[A]], 1.0 op_sel_hi:[1,0]{{$}} +; GFX9: v_pk_max_f16 v1, v1, 0 neg_lo:[1,0] +; GFX9: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0] define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid @@ -256,7 +257,8 @@ ; GCN-LABEL: {{^}}v_clamp_add_src_v2f16_denorm_neg_hi: ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] ; GFX9: v_pk_add_f16 [[ADD:v[0-9]+]], [[A]], 1.0 op_sel_hi:[1,0]{{$}} -; GFX9: v_pk_max_f16 [[MAX:v[0-9]+]], [[ADD]], [[ADD]] neg_hi:[1,1] clamp{{$}} +; GFX9: v_pk_max_f16 v1, v1, 0 neg_hi:[1,0] +; GFX9: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0] define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = 
getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid @@ -311,7 +313,7 @@ ; GCN-LABEL: {{^}}v_no_clamp_add_packed_src_f32: ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] ; GFX9: v_pk_add_f16 [[ADD:v[0-9]+]], [[A]], 1.0 op_sel_hi:[1,0]{{$}} -; GFX9: v_max_f32_e64 [[CLAMP:v[0-9]+]], [[ADD]], [[ADD]] clamp{{$}} +; GFX9: v_med3_f32 [[MED3:v[0-9]+]], [[ADD]], 0, 1.0 define amdgpu_kernel void @v_no_clamp_add_packed_src_f32(float addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid @@ -359,8 +361,10 @@ ; VI: v_or_b32 ; SI: v_cvt_pkrtz_f16_f32_e32 v0, v0, v1{{$}} -; SI-DAG: v_cvt_f32_f16_e64 v0, v0 clamp -; SI-DAG: v_cvt_f32_f16_e64 v1, v1 clamp +; SI-DAG: v_cvt_f32_f16_e32 v0, v0 +; SI-DAG: v_cvt_f32_f16_e32 v1, v1 +; SI-DAG: v_med3_f32 v0, v0, 0, 1.0 +; SI-DAG: v_med3_f32 v1, v1, 0, 1.0 define <2 x half> @v_clamp_cvt_pkrtz_src_v2f16_denorm(float %a, float %b) #0 { %add = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %b) %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %add, <2 x half> zeroinitializer) Index: llvm/test/CodeGen/AMDGPU/clamp.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/clamp.ll +++ llvm/test/CodeGen/AMDGPU/clamp.ll @@ -4,7 +4,7 @@ ; GCN-LABEL: {{^}}v_clamp_f32: ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} +; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0 define amdgpu_kernel void @v_clamp_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid @@ -119,9 +119,9 @@ ; GCN-LABEL: {{^}}v_clamp_f16: ; GCN: {{buffer|flat|global}}_load_ushort [[A:v[0-9]+]] -; GFX89: v_max_f16_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} -; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], [[A]] clamp{{$}} +; 
SI: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[A]] +; SI: v_med3_f32 [[A]], [[A]], 0, 1.0 ; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]] define amdgpu_kernel void @v_clamp_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -140,7 +140,8 @@ ; GFX89: v_max_f16_e64 v{{[0-9]+}}, -[[A]], -[[A]] clamp{{$}} ; FIXME: Better to fold neg into max -; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]] clamp{{$}} +; SI: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[A]] +; SI: v_med3_f32 [[A]], -[[A]], 0, 1.0 ; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]] define amdgpu_kernel void @v_clamp_neg_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -161,8 +162,9 @@ ; FIXME: Better to fold neg/abs into max -; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -|[[A]]| clamp{{$}} -; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]] +; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], |[[A]]| +; SI: v_med3_f32 [[MED3:v[0-9]+]], -[[CVT]], 0, 1.0 +; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[MED3]] define amdgpu_kernel void @v_clamp_negabs_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid @@ -243,7 +245,7 @@ ; GCN-LABEL: {{^}}v_clamp_med3_aby_f32: ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} +; GCN: v_med3_f32 v{{[0-9]+}}, 0, 1.0, [[A]] define amdgpu_kernel void @v_clamp_med3_aby_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid @@ -256,7 +258,7 @@ ; GCN-LABEL: {{^}}v_clamp_med3_bay_f32: ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} +; GCN: v_med3_f32 v{{[0-9]+}}, 1.0, 0, [[A]] define amdgpu_kernel void @v_clamp_med3_bay_f32(float addrspace(1)* %out, float 
addrspace(1)* %aptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid @@ -269,7 +271,7 @@ ; GCN-LABEL: {{^}}v_clamp_med3_yab_f32: ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} +; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0 define amdgpu_kernel void @v_clamp_med3_yab_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid @@ -295,7 +297,7 @@ ; GCN-LABEL: {{^}}v_clamp_med3_ayb_f32: ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} +; GCN: v_med3_f32 v{{[0-9]+}}, 0, [[A]], 1.0 define amdgpu_kernel void @v_clamp_med3_ayb_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid @@ -319,6 +321,45 @@ ret void } +; GCN-LABEL: {{^}}v_clamp_med3_ayb_f32_snan_no_ieee: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} +define amdgpu_kernel void @v_clamp_med3_ayb_f32_snan_no_ieee(float addrspace(1)* %out, float addrspace(1)* %aptr) #5 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0) + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_nnan_med3_ayb_f32: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN: v_med3_f32 v{{[0-9]+}}, 0, [[A]], 1.0 +define amdgpu_kernel void @v_clamp_nnan_med3_ayb_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { + %tid = call i32 
@llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %med = call nnan float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0) + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_med3_ayb_f32_no_nans_fp_math: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} +define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_nans_fp_math(float addrspace(1)* %out, float addrspace(1)* %aptr) #6 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0) + store float %med, float addrspace(1)* %out.gep + ret void +} + ; GCN-LABEL: {{^}}v_clamp_constants_to_one_f32: ; GCN: v_mov_b32_e32 v{{[0-9]+}}, 1.0 define amdgpu_kernel void @v_clamp_constants_to_one_f32(float addrspace(1)* %out) #0 { @@ -385,8 +426,7 @@ ; GCN-LABEL: {{^}}v_clamp_f32_no_dx10_clamp: ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 0.5, [[A]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 0, 1.0 +; GCN: v_add_f32_e64 [[ADD:v[0-9]+]], [[A]], 0.5 clamp{{$}} define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid @@ -418,9 +458,7 @@ ; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp: ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GFX678: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]] -; GFX9: v_max_f32_e32 [[QUIET_A:v[0-9]+]], [[A]], [[A]] -; GCN: v_med3_f32 {{v[0-9]+}}, [[QUIET_A]], 0, 1.0 +; GCN: v_med3_f32 
{{v[0-9]+}}, [[A]], 0, 1.0 define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid @@ -435,8 +473,7 @@ ; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp_nnan_src: ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 0, 1.0 +; GCN: v_add_f32_e64 [[ADD:v[0-9]+]], [[A]], 1.0 clamp{{$}} define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid @@ -452,7 +489,7 @@ ; GCN-LABEL: {{^}}v_clamp_med3_aby_f32_no_dx10_clamp: ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} +; GCN: v_med3_f32 v{{[0-9]+}}, 0, 1.0, [[A]] define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid @@ -465,7 +502,7 @@ ; GCN-LABEL: {{^}}v_clamp_med3_bay_f32_no_dx10_clamp: ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} +; GCN: v_med3_f32 v{{[0-9]+}}, 1.0, 0, [[A]] define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid @@ -775,3 +812,5 @@ attributes #2 = { nounwind "amdgpu-dx10-clamp"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-nans-fp-math"="false" } attributes #3 = { nounwind "amdgpu-dx10-clamp"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-nans-fp-math"="false" } 
attributes #4 = { nounwind "amdgpu-dx10-clamp"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-nans-fp-math"="false" } +attributes #5 = { nounwind "amdgpu-dx10-clamp"="true" "amdgpu-ieee"="false" } +attributes #6 = { nounwind "no-nans-fp-math"="true" } Index: llvm/test/CodeGen/AMDGPU/known-never-snan.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/known-never-snan.ll +++ llvm/test/CodeGen/AMDGPU/known-never-snan.ll @@ -248,7 +248,6 @@ ; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] %b.nnan.add = fadd nnan float %b, 1.0 @@ -266,7 +265,6 @@ ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] %a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0 @@ -527,7 +525,6 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_med3_f32 v0, v0, v1, v2 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] %known.not.snan = call float @llvm.amdgcn.fmed3.f32(float %a, float %b, float %c) Index: llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll +++ llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll @@ -146,12 +146,15 @@ ; FIXME: Should be packed into 2 registers per argument? 
; GCN-LABEL: {{^}}v_mad_mix_v3f32_clamp_postcvt: ; GCN: s_waitcnt -; GFX9-DAG: v_mad_mixlo_f16 v{{[0-9]+}}, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GFX9-DAG: v_mad_mixhi_f16 v{{[0-9]+}}, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX9-DAG: v_mad_mixlo_f16 v{{[0-9]+}}, v1, v3, v5 op_sel_hi:[1,1,1] -; GFX9-DAG: v_pk_max_f16 v1, v1, v1 clamp -; GFX9: v_mov_b32_e32 v0, v{{[0-9]+}} -; GFX9-NEXT: s_setpc_b64 +; GFX9-DAG: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] +; GFX9-DAG: v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] +; GFX9-DAG: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-DAG: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX9-DAG: v_pk_max_f16 v1, v1, 0 +; GFX9-DAG: v_pk_max_f16 v0, v6, 0 +; GFX9-DAG: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0] +; GFX9-DAG: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0] +; GFX9: s_setpc_b64 define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 { %src0.ext = fpext <3 x half> %src0 to <3 x float> %src1.ext = fpext <3 x half> %src1 to <3 x float> @@ -165,12 +168,14 @@ ; GCN-LABEL: {{^}}v_mad_mix_v4f32_clamp_postcvt: ; GCN: s_waitcnt -; GFX9-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GFX9-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX9-NEXT: v_mad_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp -; GFX9-NEXT: v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX9-NEXT: v_mov_b32_e32 v0, v6 -; GFX9-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] +; GFX9-NEXT: v_mad_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] +; GFX9-NEXT: v_mad_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX9-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX9-NEXT: v_pk_max_f16 v1, v7, 0 +; GFX9-NEXT: v_pk_max_f16 v0, v6, 0 +; GFX9-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0] ; 
GFX9-NEXT: s_setpc_b64 define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 { %src0.ext = fpext <4 x half> %src0 to <4 x float>