Index: lib/Target/AMDGPU/AMDGPU.td =================================================================== --- lib/Target/AMDGPU/AMDGPU.td +++ lib/Target/AMDGPU/AMDGPU.td @@ -245,6 +245,12 @@ [FeatureFP64FP16Denormals] >; +def FeatureDX10Clamp : SubtargetFeature<"dx10-clamp", + "DX10Clamp", + "true", + "clamp modifier clamps NaNs to 0.0" +>; + def FeatureFPExceptions : SubtargetFeature<"fp-exceptions", "FPExceptions", "true", Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -582,7 +582,7 @@ ProgInfo.IEEEMode = STM.enableIEEEBit(MF); // Make clamp modifier on NaN input returns 0. - ProgInfo.DX10Clamp = 1; + ProgInfo.DX10Clamp = STM.enableDX10Clamp(); const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); ProgInfo.ScratchSize = FrameInfo.getStackSize(); Index: lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.h +++ lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -70,6 +70,7 @@ bool shouldCombineMemoryType(EVT VT) const; SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, @@ -238,7 +239,11 @@ RETURN, DWORDADDR, FRACT, + + /// CLAMP value between 0.0 and 1.0. NaN clamped to 0, following clamp output + /// modifier behavior with dx10_enable. CLAMP, + // This is SETCC with the full mask result which is used for a compare with a // result bit per item in the wavefront. 
SETCC, Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -1012,22 +1012,29 @@ EVT VT = Op.getValueType(); switch (IntrinsicID) { - default: return Op; - case AMDGPUIntrinsic::AMDGPU_clamp: // Legacy name. - return DAG.getNode(AMDGPUISD::CLAMP, DL, VT, - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + default: return Op; + case AMDGPUIntrinsic::AMDGPU_clamp: { + // Deprecated in favor of emitting min/max combo or fmed3. + ConstantFPSDNode *CSrc1 = dyn_cast<ConstantFPSDNode>(Op.getOperand(2)); + ConstantFPSDNode *CSrc2 = dyn_cast<ConstantFPSDNode>(Op.getOperand(3)); + if (CSrc1 && CSrc2 && CSrc1->isZero() && CSrc2->isExactlyValue(1.0)) + return DAG.getNode(AMDGPUISD::CLAMP, DL, VT, Op.getOperand(1)); - case AMDGPUIntrinsic::AMDGPU_bfe_i32: - return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, - Op.getOperand(1), - Op.getOperand(2), - Op.getOperand(3)); + SDValue Max = DAG.getNode(ISD::FMAXNUM, DL, VT, Op.getOperand(1), + Op.getOperand(2)); + return DAG.getNode(ISD::FMINNUM, DL, VT, Max, Op.getOperand(3)); + } + case AMDGPUIntrinsic::AMDGPU_bfe_i32: + return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, + Op.getOperand(1), + Op.getOperand(2), + Op.getOperand(3)); - case AMDGPUIntrinsic::AMDGPU_bfe_u32: - return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, - Op.getOperand(1), - Op.getOperand(2), - Op.getOperand(3)); + case AMDGPUIntrinsic::AMDGPU_bfe_u32: + return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, + Op.getOperand(1), + Op.getOperand(2), + Op.getOperand(3)); } } @@ -2445,6 +2452,28 @@ SN->getBasePtr(), SN->getMemOperand()); } +SDValue AMDGPUTargetLowering::performClampCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0)); + if (!CSrc) + return SDValue(); + + const APFloat &F = CSrc->getValueAPF(); + APFloat Zero = APFloat::getZero(F.getSemantics()); + APFloat::cmpResult Cmp0 = F.compare(Zero); + if
(Cmp0 == APFloat::cmpLessThan || + (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) { + return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0)); + } + + APFloat One(F.getSemantics(), "1.0"); + APFloat::cmpResult Cmp1 = F.compare(One); + if (Cmp1 == APFloat::cmpGreaterThan) + return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0)); + + return SDValue(CSrc, 0); +} + /// Split the 64-bit value \p LHS into two 32-bit components, and perform the /// binary operation \p Opc to it with the corresponding constant operands. SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl( @@ -3323,6 +3352,8 @@ return performLoadCombine(N, DCI); case ISD::STORE: return performStoreCombine(N, DCI); + case AMDGPUISD::CLAMP: + return performClampCombine(N, DCI); } return SDValue(); } Index: lib/Target/AMDGPU/AMDGPUInstrInfo.td =================================================================== --- lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -92,7 +92,7 @@ [SDNPCommutative, SDNPAssociative] >; -def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPTernaryOp, []>; +def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>; // out = min(a, b) a and b are floats, where a nan comparison fails. 
def AMDGPUfmin_legacy : SDNode<"AMDGPUISD::FMIN_LEGACY", SDTFPBinOp, Index: lib/Target/AMDGPU/AMDGPUInstructions.td =================================================================== --- lib/Target/AMDGPU/AMDGPUInstructions.td +++ lib/Target/AMDGPU/AMDGPUInstructions.td @@ -452,7 +452,7 @@ (outs rc:$dst), (ins rc:$src0), "CLAMP $dst, $src0", - [(set f32:$dst, (AMDGPUclamp f32:$src0, (f32 FP_ZERO), (f32 FP_ONE)))] + [(set f32:$dst, (AMDGPUclamp f32:$src0))] >; class FABS : AMDGPUShaderInst < Index: lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.h +++ lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -100,6 +100,7 @@ bool FP32Denormals; bool FP64FP16Denormals; bool FPExceptions; + bool DX10Clamp; bool FlatForGlobal; bool UnalignedScratchAccess; bool UnalignedBufferAccess; @@ -289,10 +290,6 @@ return DumpCode; } - bool enableIEEEBit(const MachineFunction &MF) const { - return AMDGPU::isCompute(MF.getFunction()->getCallingConv()); - } - /// Return the amount of LDS that can be used that will not restrict the /// occupancy lower than WaveCount. unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, @@ -318,6 +315,14 @@ return FPExceptions; } + bool enableDX10Clamp() const { + return DX10Clamp; + } + + bool enableIEEEBit(const MachineFunction &MF) const { + return AMDGPU::isCompute(MF.getFunction()->getCallingConv()); + } + bool useFlatForGlobal() const { return FlatForGlobal; } Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -42,7 +42,7 @@ // for SI has the unhelpful behavior that it unsets everything else if you // disable it. 
- SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+load-store-opt,"); + SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+dx10-clamp,+load-store-opt,"); if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA. FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,"; @@ -89,6 +89,7 @@ FP32Denormals(false), FP64FP16Denormals(false), FPExceptions(false), + DX10Clamp(false), FlatForGlobal(false), UnalignedScratchAccess(false), UnalignedBufferAccess(false), Index: lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- lib/Target/AMDGPU/SIISelLowering.h +++ lib/Target/AMDGPU/SIISelLowering.h @@ -84,7 +84,10 @@ SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFCanonicalizeCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, + SDValue Op0, SDValue Op1) const; SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const; unsigned getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0, const SDNode *N1) const; Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -3988,8 +3988,10 @@ return DAG.isKnownNeverNaN(Op); } -static SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, - SDValue Op0, SDValue Op1) { +SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG, + const SDLoc &SL, + SDValue Op0, + SDValue Op1) const { ConstantFPSDNode *K1 = dyn_cast<ConstantFPSDNode>(Op1); if (!K1) return SDValue(); @@ -4003,6 +4005,20 @@ if (Cmp == APFloat::cmpGreaterThan) return SDValue(); + // TODO: Check IEEE bit enabled? + EVT VT = K0->getValueType(0); + if (Subtarget->enableDX10Clamp()) { + // If dx10_clamp is enabled, NaNs clamp to 0.0.
This is the same as the + // hardware fmed3 behavior converting to a min. + // FIXME: Should this be allowing -0.0? + if (K1->isExactlyValue(1.0) && K0->isZero()) + return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0)); + } + + // No med3 for f16, but clamp is possible. + if (VT == MVT::f16) + return SDValue(); + // This isn't safe with signaling NaNs because in IEEE mode, min/max on a // signaling NaN gives a quiet NaN. The quiet NaN input to the min would then // give the other result, which is different from med3 with a NaN input. @@ -4067,7 +4083,9 @@ if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) || (Opc == AMDGPUISD::FMIN_LEGACY && Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) && - N->getValueType(0) == MVT::f32 && Op0.hasOneUse()) { + (N->getValueType(0) == MVT::f32 || + (N->getValueType(0) == MVT::f16 && Subtarget->has16BitInsts())) && + Op0.hasOneUse()) { if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1)) return Res; } @@ -4075,6 +4093,60 @@ return SDValue(); } +static bool isClampZeroToOne(SDValue A, SDValue B) { + if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) { + if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) { + // FIXME: Should this be allowing -0.0? + return (CA->isZero() && CB->isExactlyValue(1.0)) || + (CA->isExactlyValue(1.0) && CB->isZero()); + } + } + + return false; +} + +// FIXME: Should only worry about snans for version with chain. +SDValue SITargetLowering::performFMed3Combine(SDNode *N, + DAGCombinerInfo &DCI) const { + EVT VT = N->getValueType(0); + // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and + // NaNs. With a NaN input, the order of the operands may change the result. + + SelectionDAG &DAG = DCI.DAG; + SDLoc SL(N); + + SDValue Src0 = N->getOperand(0); + SDValue Src1 = N->getOperand(1); + SDValue Src2 = N->getOperand(2); + + if (isClampZeroToOne(Src0, Src1)) { + // const_a, const_b, x -> clamp is safe in all cases including signaling + // nans.
+ // FIXME: Should this be allowing -0.0? + return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2); + } + + // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother + // handling no dx10-clamp? + if (Subtarget->enableDX10Clamp()) { + // If NaNs is clamped to 0, we are free to reorder the inputs. + + if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1)) + std::swap(Src0, Src1); + + if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2)) + std::swap(Src1, Src2); + + if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1)) + std::swap(Src0, Src1); + + if (isClampZeroToOne(Src1, Src2)) + return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0); + } + + return SDValue(); +} + unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0, const SDNode *N1) const { @@ -4341,6 +4413,8 @@ case AMDGPUISD::CVT_F32_UBYTE2: case AMDGPUISD::CVT_F32_UBYTE3: return performCvtF32UByteNCombine(N, DCI); + case AMDGPUISD::FMED3: + return performFMed3Combine(N, DCI); } return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); } Index: lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.td +++ lib/Target/AMDGPU/SIInstrInfo.td @@ -625,6 +625,7 @@ def DSTCLAMP { int NONE = 0; + int ENABLE = 1; } def DSTOMOD { Index: lib/Target/AMDGPU/SIInstructions.td =================================================================== --- lib/Target/AMDGPU/SIInstructions.td +++ lib/Target/AMDGPU/SIInstructions.td @@ -647,12 +647,20 @@ /********** Src & Dst modifiers **********/ /********** =================== **********/ -def : Pat < - (AMDGPUclamp (VOP3Mods0Clamp f32:$src0, i32:$src0_modifiers, i32:$omod), - (f32 FP_ZERO), (f32 FP_ONE)), - (V_ADD_F32_e64 $src0_modifiers, $src0, 0, (i32 0), 1, $omod) + +// If denormals are not enabled, it only impacts the compare of the +// inputs. The output result is not flushed.
+class ClampPat<Instruction inst, ValueType vt> : Pat < + (vt (AMDGPUclamp + (VOP3Mods0Clamp vt:$src0, i32:$src0_modifiers, i32:$omod))), + (inst i32:$src0_modifiers, vt:$src0, + i32:$src0_modifiers, vt:$src0, DSTCLAMP.ENABLE, $omod) >; +// TODO: Does f64 support clamp? +def : ClampPat<V_MAX_F32_e64, f32>; +def : ClampPat<V_MAX_F16_e64, f16>; + /********** ================================ **********/ /********** Floating point absolute/negative **********/ /********** ================================ **********/ Index: test/CodeGen/AMDGPU/clamp.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/clamp.ll @@ -0,0 +1,522 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=NOSNAN -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=NOSNAN -check-prefix=GCN -check-prefix=VI %s + +; GCN-LABEL: {{^}}v_clamp_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} +define amdgpu_kernel void @v_clamp_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %max = call float @llvm.maxnum.f32(float %a, float 0.0) + %med = call float @llvm.minnum.f32(float %max, float 1.0) + + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_neg_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_max_f32_e64 v{{[0-9]+}}, -[[A]], -[[A]] clamp{{$}} +define amdgpu_kernel void @v_clamp_neg_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)*
%gep0 + %fneg.a = fsub float -0.0, %a + %max = call float @llvm.maxnum.f32(float %fneg.a, float 0.0) + %med = call float @llvm.minnum.f32(float %max, float 1.0) + + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_negabs_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_max_f32_e64 v{{[0-9]+}}, -|[[A]]|, -|[[A]]| clamp{{$}} +define amdgpu_kernel void @v_clamp_negabs_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %fabs.a = call float @llvm.fabs.f32(float %a) + %fneg.fabs.a = fsub float -0.0, %fabs.a + + %max = call float @llvm.maxnum.f32(float %fneg.fabs.a, float 0.0) + %med = call float @llvm.minnum.f32(float %max, float 1.0) + + store float %med, float addrspace(1)* %out.gep + ret void +} + +; FIXME: Should this work? 
+; GCN-LABEL: {{^}}v_clamp_negzero_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} +define amdgpu_kernel void @v_clamp_negzero_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %max = call float @llvm.maxnum.f32(float %a, float -0.0) + %med = call float @llvm.minnum.f32(float %max, float 1.0) + + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_multi_use_max_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]] +; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]] +define amdgpu_kernel void @v_clamp_multi_use_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %max = call float @llvm.maxnum.f32(float %a, float 0.0) + %med = call float @llvm.minnum.f32(float %max, float 1.0) + + store float %med, float addrspace(1)* %out.gep + store volatile float %max, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}v_clamp_f16: +; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] +; VI: v_max_f16_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} + +; SI: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[A]] +; SI: v_max_f32_e64 v{{[0-9]+}}, [[CVT]], [[CVT]] clamp{{$}} +; SI: v_cvt_f16_f32_e32 +define amdgpu_kernel void @v_clamp_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid + %a 
= load half, half addrspace(1)* %gep0 + %max = call half @llvm.maxnum.f16(half %a, half 0.0) + %med = call half @llvm.minnum.f16(half %max, half 1.0) + + store half %med, half addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_neg_f16: +; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] +; VI: v_max_f16_e64 v{{[0-9]+}}, -[[A]], -[[A]] clamp{{$}} + +; FIXME: Better to fold neg into max +; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]] +; SI: v_max_f32_e64 v{{[0-9]+}}, [[CVT]], [[CVT]] clamp{{$}} +; SI: v_cvt_f16_f32 +define amdgpu_kernel void @v_clamp_neg_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid + %a = load half, half addrspace(1)* %gep0 + %fneg.a = fsub half -0.0, %a + %max = call half @llvm.maxnum.f16(half %fneg.a, half 0.0) + %med = call half @llvm.minnum.f16(half %max, half 1.0) + + store half %med, half addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_negabs_f16: +; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] +; VI: v_max_f16_e64 v{{[0-9]+}}, -|[[A]]|, -|[[A]]| clamp{{$}} + +; FIXME: Better to fold neg/abs into max + +; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -|[[A]]| +; SI: v_max_f32_e64 v{{[0-9]+}}, [[CVT]], [[CVT]] clamp{{$}} +; SI: v_cvt_f16_f32_e32 +define amdgpu_kernel void @v_clamp_negabs_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid + %a = load half, half addrspace(1)* %gep0 + %fabs.a = call half @llvm.fabs.f16(half %a) + %fneg.fabs.a = fsub half -0.0, %fabs.a + + %max = call half @llvm.maxnum.f16(half %fneg.fabs.a, half 0.0) + %med = call half @llvm.minnum.f16(half %max, half 1.0) + + store half %med, half addrspace(1)* %out.gep + ret 
void +} + +; FIXME: Do f64 instructions support clamp? +; GCN-LABEL: {{^}}v_clamp_f64: +; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] +; GCN: v_max_f64 +; GCN: v_min_f64 +define amdgpu_kernel void @v_clamp_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid + %a = load double, double addrspace(1)* %gep0 + %max = call double @llvm.maxnum.f64(double %a, double 0.0) + %med = call double @llvm.minnum.f64(double %max, double 1.0) + + store double %med, double addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_neg_f64: +; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] +; GCN: v_max_f64 +; GCN: v_min_f64 +define amdgpu_kernel void @v_clamp_neg_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid + %a = load double, double addrspace(1)* %gep0 + %fneg.a = fsub double -0.0, %a + %max = call double @llvm.maxnum.f64(double %fneg.a, double 0.0) + %med = call double @llvm.minnum.f64(double %max, double 1.0) + + store double %med, double addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_negabs_f64: +; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] +; GCN: v_max_f64 +; GCN: v_min_f64 +define amdgpu_kernel void @v_clamp_negabs_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid + %a = load double, double addrspace(1)* %gep0 + %fabs.a = call double @llvm.fabs.f64(double %a) + %fneg.fabs.a = fsub double -0.0, %fabs.a + + %max = 
call double @llvm.maxnum.f64(double %fneg.fabs.a, double 0.0) + %med = call double @llvm.minnum.f64(double %max, double 1.0) + + store double %med, double addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_med3_aby_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} +define amdgpu_kernel void @v_clamp_med3_aby_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %a) + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_med3_bay_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} +define amdgpu_kernel void @v_clamp_med3_bay_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float 0.0, float %a) + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_med3_yab_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} +define amdgpu_kernel void @v_clamp_med3_yab_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 0.0, float 
1.0) + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_med3_yba_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} +define amdgpu_kernel void @v_clamp_med3_yba_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 1.0, float 0.0) + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_med3_ayb_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} +define amdgpu_kernel void @v_clamp_med3_ayb_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0) + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_med3_bya_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} +define amdgpu_kernel void @v_clamp_med3_bya_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %a, float 0.0) + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_constants_to_one_f32: +; GCN: 
v_mov_b32_e32 v{{[0-9]+}}, 1.0 +define amdgpu_kernel void @v_clamp_constants_to_one_f32(float addrspace(1)* %out) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 4.0) + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_constants_to_zero_f32: +; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} +define amdgpu_kernel void @v_clamp_constants_to_zero_f32(float addrspace(1)* %out) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float -4.0) + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_constant_preserve_f32: +; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0.5 +define amdgpu_kernel void @v_clamp_constant_preserve_f32(float addrspace(1)* %out) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0.5) + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_constant_preserve_denorm_f32: +; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7fffff{{$}} +define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(float addrspace(1)* %out) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 8388607 to float)) + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_constant_qnan_f32: +; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} +define amdgpu_kernel void @v_clamp_constant_qnan_f32(float addrspace(1)* %out) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %out.gep = getelementptr float, float 
addrspace(1)* %out, i32 %tid + %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8000000000000) + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_constant_snan_f32: +; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} +define amdgpu_kernel void @v_clamp_constant_snan_f32(float addrspace(1)* %out) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 2139095041 to float)) + store float %med, float addrspace(1)* %out.gep + ret void +} + +; --------------------------------------------------------------------- +; Test non-default behaviors enabling snans and disabling dx10_clamp +; --------------------------------------------------------------------- + +; GCN-LABEL: {{^}}v_clamp_f32_no_dx10_clamp: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0 +define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %max = call float @llvm.maxnum.f32(float %a, float 0.0) + %med = call float @llvm.minnum.f32(float %max, float 1.0) + + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_f32_snan_dx10clamp: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} +define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #3 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, 
float addrspace(1)* %gep0 + %max = call float @llvm.maxnum.f32(float %a, float 0.0) + %med = call float @llvm.minnum.f32(float %max, float 1.0) + + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]] +; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]] +define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %max = call float @llvm.maxnum.f32(float %a, float 0.0) + %med = call float @llvm.minnum.f32(float %max, float 1.0) + + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp_nnan_src: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0 +define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %add = fadd nnan float %a, 1.0 + %max = call float @llvm.maxnum.f32(float %add, float 0.0) + %med = call float @llvm.minnum.f32(float %max, float 1.0) + + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_med3_aby_f32_no_dx10_clamp: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} +define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + 
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %a) + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_med3_bay_f32_no_dx10_clamp: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}} +define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float 0.0, float %a) + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_med3_yab_f32_no_dx10_clamp: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0 +define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 0.0, float 1.0) + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_med3_yba_f32_no_dx10_clamp: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 1.0, 0 +define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + 
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 1.0, float 0.0) + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_med3_ayb_f32_no_dx10_clamp: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_med3_f32 v{{[0-9]+}}, 0, [[A]], 1.0 +define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0) + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_med3_bya_f32_no_dx10_clamp: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_med3_f32 v{{[0-9]+}}, 1.0, [[A]], 0 +define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %a, float 0.0) + store float %med, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_constant_qnan_f32_no_dx10_clamp: +; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7fc00000 +define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(float addrspace(1)* %out) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8000000000000) + store float %med, float addrspace(1)* %out.gep + ret 
void +} + +; GCN-LABEL: {{^}}v_clamp_constant_snan_f32_no_dx10_clamp: +; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7f800001 +define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(float addrspace(1)* %out) #2 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 2139095041 to float)) + store float %med, float addrspace(1)* %out.gep + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 +declare float @llvm.fabs.f32(float) #1 +declare float @llvm.minnum.f32(float, float) #1 +declare float @llvm.maxnum.f32(float, float) #1 +declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1 +declare double @llvm.fabs.f64(double) #1 +declare double @llvm.minnum.f64(double, double) #1 +declare double @llvm.maxnum.f64(double, double) #1 +declare half @llvm.fabs.f16(half) #1 +declare half @llvm.minnum.f16(half, half) #1 +declare half @llvm.maxnum.f16(half, half) #1 + + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind "target-features"="-dx10-clamp,-fp-exceptions" "no-nans-fp-math"="false" } +attributes #3 = { nounwind "target-features"="+dx10-clamp,+fp-exceptions" "no-nans-fp-math"="false" } +attributes #4 = { nounwind "target-features"="-dx10-clamp,+fp-exceptions" "no-nans-fp-math"="false" } Index: test/CodeGen/AMDGPU/hsa-fp-mode.ll =================================================================== --- test/CodeGen/AMDGPU/hsa-fp-mode.ll +++ test/CodeGen/AMDGPU/hsa-fp-mode.ll @@ -60,9 +60,20 @@ ret void } +; GCN-LABEL: {{^}}test_no_dx10_clamp_vi: +; GCN: float_mode = 192 +; GCN: enable_dx10_clamp = 0 +; GCN: enable_ieee_mode = 1 +define void @test_no_dx10_clamp_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #6 { + store float 0.0, float addrspace(1)* %out0 + store double 0.0, double addrspace(1)* %out1 + ret void +} + attributes #0 = { nounwind 
"target-cpu"="kaveri" } attributes #1 = { nounwind "target-cpu"="fiji" } attributes #2 = { nounwind "target-features"="-fp32-denormals,+fp64-fp16-denormals" } attributes #3 = { nounwind "target-features"="+fp32-denormals,-fp64-fp16-denormals" } attributes #4 = { nounwind "target-features"="+fp32-denormals,+fp64-fp16-denormals" } attributes #5 = { nounwind "target-features"="-fp32-denormals,-fp64-fp16-denormals" } +attributes #6 = { nounwind "target-cpu"="fiji" "target-features"="-dx10-clamp" } Index: test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll +++ test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll @@ -7,7 +7,7 @@ ; FUNC-LABEL: {{^}}clamp_0_1_f32: ; SI: s_load_dword [[ARG:s[0-9]+]], -; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], [[ARG]], 0 clamp{{$}} +; SI: v_max_f32_e64 [[RESULT:v[0-9]+]], [[ARG]], [[ARG]] clamp{{$}} ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm @@ -20,7 +20,7 @@ ; FUNC-LABEL: {{^}}clamp_fabs_0_1_f32: ; SI: s_load_dword [[ARG:s[0-9]+]], -; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], |[[ARG]]|, 0 clamp{{$}} +; SI: v_max_f32_e64 [[RESULT:v[0-9]+]], |[[ARG]]|, |[[ARG]]| clamp{{$}} ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define void @clamp_fabs_0_1_f32(float addrspace(1)* %out, float %src) nounwind { @@ -32,7 +32,7 @@ ; FUNC-LABEL: {{^}}clamp_fneg_0_1_f32: ; SI: s_load_dword [[ARG:s[0-9]+]], -; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], -[[ARG]], 0 clamp{{$}} +; SI: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[ARG]], -[[ARG]] clamp{{$}} ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define void @clamp_fneg_0_1_f32(float addrspace(1)* %out, float %src) nounwind { @@ -44,7 +44,7 @@ ; FUNC-LABEL: {{^}}clamp_fneg_fabs_0_1_f32: ; SI: s_load_dword [[ARG:s[0-9]+]], -; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], -|[[ARG]]|, 0 clamp{{$}} +; SI: v_max_f32_e64 [[RESULT:v[0-9]+]], -|[[ARG]]|, -|[[ARG]]| clamp{{$}} ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define void 
@clamp_fneg_fabs_0_1_f32(float addrspace(1)* %out, float %src) nounwind {