Index: docs/AMDGPUUsage.rst =================================================================== --- docs/AMDGPUUsage.rst +++ docs/AMDGPUUsage.rst @@ -435,6 +435,14 @@ "amdgpu-waves-per-eu"="m,n" Specify the minimum and maximum number of waves per execution unit. Generated by the ``amdgpu_waves_per_eu`` CLANG attribute [CLANG-ATTR]_. + + "amdgpu-ieee" true/false. Specify whether the function expects + the IEEE field of the mode register to be set on entry. Overrides + the default for the calling convention. + "amdgpu-dx10-clamp" true/false. Specify whether the function expects + the DX10_CLAMP field of the mode register to be set on entry. Overrides + the default for the calling convention. + ======================================= ========================================================== Code Object Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -892,10 +892,11 @@ // register. ProgInfo.FloatMode = getFPMode(MF); - ProgInfo.IEEEMode = STM.enableIEEEBit(MF); + const SIModeRegisterDefaults Mode = MFI->getMode(); + ProgInfo.IEEEMode = Mode.IEEE; // Make clamp modifier on NaN input returns 0. 
- ProgInfo.DX10Clamp = STM.enableDX10Clamp(); + ProgInfo.DX10Clamp = Mode.DX10Clamp; unsigned LDSAlignShift; if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { Index: lib/Target/AMDGPU/AMDGPUFeatures.td =================================================================== --- lib/Target/AMDGPU/AMDGPUFeatures.td +++ lib/Target/AMDGPU/AMDGPUFeatures.td @@ -54,12 +54,6 @@ SubtargetFeature ; -def FeatureDX10Clamp : SubtargetFeature<"dx10-clamp", - "DX10Clamp", - "true", - "clamp modifier clamps NaNs to 0.0" ->; - def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca", "EnablePromoteAlloca", "true", Index: lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.h +++ lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -286,7 +286,6 @@ // Dynamially set bits that enable features. bool FP64FP16Denormals; - bool DX10Clamp; bool FlatForGlobal; bool AutoWaitcntBeforeBarrier; bool CodeObjectV3; @@ -531,14 +530,6 @@ return getGeneration() >= AMDGPUSubtarget::GFX9; } - bool enableDX10Clamp() const { - return DX10Clamp; - } - - bool enableIEEEBit(const MachineFunction &MF) const { - return AMDGPU::isCompute(MF.getFunction().getCallingConv()); - } - bool useFlatForGlobal() const { return FlatForGlobal; } @@ -970,7 +961,6 @@ bool FMA; bool CaymanISA; bool CFALUBug; - bool DX10Clamp; bool HasVertexCache; bool R600ALUInst; bool FP64; Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -45,7 +45,7 @@ R600Subtarget & R600Subtarget::initializeSubtargetDependencies(const Triple &TT, StringRef GPU, StringRef FS) { - SmallString<256> FullFS("+promote-alloca,+dx10-clamp,"); + SmallString<256> FullFS("+promote-alloca,"); FullFS += FS; ParseSubtargetFeatures(GPU, FullFS); @@ -77,7 +77,7 @@ // Similarly we want enable-prt-strict-null to be on by default and not to 
// unset everything else if it is disabled - SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,"); + SmallString<256> FullFS("+promote-alloca,+load-store-opt,"); if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA. FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,"; @@ -164,7 +164,6 @@ HalfRate64Ops(false), FP64FP16Denormals(false), - DX10Clamp(false), FlatForGlobal(false), AutoWaitcntBeforeBarrier(false), CodeObjectV3(false), @@ -461,7 +460,6 @@ FMA(false), CaymanISA(false), CFALUBug(false), - DX10Clamp(false), HasVertexCache(false), R600ALUInst(false), FP64(false), Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -611,7 +611,7 @@ } bool GCNTTIImpl::areInlineCompatible(const Function *Caller, - const Function *Callee) const { + const Function *Callee) const { const TargetMachine &TM = getTLI()->getTargetMachine(); const FeatureBitset &CallerBits = TM.getSubtargetImpl(*Caller)->getFeatureBits(); @@ -620,7 +620,14 @@ FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList; FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList; - return ((RealCallerBits & RealCalleeBits) == RealCalleeBits); + if ((RealCallerBits & RealCalleeBits) != RealCalleeBits) + return false; + + // FIXME: dx10_clamp can just take the caller setting, but there seems to be + // no way to support merge for backend defined attributes. 
+ AMDGPU::SIModeRegisterDefaults CallerMode(*Caller); + AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee); + return CallerMode.isInlineCompatible(CalleeMode); } void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, Index: lib/Target/AMDGPU/SIFoldOperands.cpp =================================================================== --- lib/Target/AMDGPU/SIFoldOperands.cpp +++ lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1120,7 +1120,8 @@ // omod is ignored by hardware if IEEE bit is enabled. omod also does not // correctly handle signed zeros. // - bool IsIEEEMode = ST->enableIEEEBit(MF); + // FIXME: Also need to check strictfp + bool IsIEEEMode = MFI->getMode().IEEE; bool HasNSZ = MFI->hasNoSignedZerosFPMath(); for (MachineBasicBlock *MBB : depth_first(&MF)) { Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -4145,7 +4145,9 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); - bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction()); + const MachineFunction &MF = DAG.getMachineFunction(); + const SIMachineFunctionInfo *Info = MF.getInfo(); + bool IsIEEEMode = Info->getMode().IEEE; // FIXME: Assert during eslection that this is only selected for // ieee_mode. Currently a combine can produce the ieee version for non-ieee @@ -8300,9 +8302,12 @@ if (Cmp == APFloat::cmpGreaterThan) return SDValue(); + const MachineFunction &MF = DAG.getMachineFunction(); + const SIMachineFunctionInfo *Info = MF.getInfo(); + // TODO: Check IEEE bit enabled? EVT VT = Op0.getValueType(); - if (Subtarget->enableDX10Clamp()) { + if (Info->getMode().DX10Clamp) { // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the // hardware fmed3 behavior converting to a min. // FIXME: Should this be allowing -0.0? 
@@ -8436,9 +8441,12 @@ return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2); } + const MachineFunction &MF = DAG.getMachineFunction(); + const SIMachineFunctionInfo *Info = MF.getInfo(); + // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother // handling no dx10-clamp? - if (Subtarget->enableDX10Clamp()) { + if (Info->getMode().DX10Clamp) { // If NaNs is clamped to 0, we are free to reorder the inputs. if (isa(Src0) && !isa(Src1)) @@ -9128,11 +9136,13 @@ if (!CSrc) return SDValue(); + const MachineFunction &MF = DCI.DAG.getMachineFunction(); const APFloat &F = CSrc->getValueAPF(); APFloat Zero = APFloat::getZero(F.getSemantics()); APFloat::cmpResult Cmp0 = F.compare(Zero); if (Cmp0 == APFloat::cmpLessThan || - (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) { + (Cmp0 == APFloat::cmpUnordered && + MF.getInfo()->getMode().DX10Clamp)) { return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0)); } @@ -9967,7 +9977,10 @@ bool SNaN, unsigned Depth) const { if (Op.getOpcode() == AMDGPUISD::CLAMP) { - if (Subtarget->enableDX10Clamp()) + const MachineFunction &MF = DAG.getMachineFunction(); + const SIMachineFunctionInfo *Info = MF.getInfo(); + + if (Info->getMode().DX10Clamp) return true; // Clamped to 0. return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1); } Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h =================================================================== --- lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -148,6 +148,9 @@ AMDGPUFunctionArgInfo ArgInfo; + // State of MODE register, assumed FP mode. + AMDGPU::SIModeRegisterDefaults Mode; + // Graphics info. 
unsigned PSInputAddr = 0; unsigned PSInputEnable = 0; @@ -281,6 +284,10 @@ return SpillVGPRs; } + AMDGPU::SIModeRegisterDefaults getMode() const { + return Mode; + } + bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI); void removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI); Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -28,6 +28,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) : AMDGPUMachineFunction(MF), + Mode(MF.getFunction()), PrivateSegmentBuffer(false), DispatchPtr(false), QueuePtr(false), Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h =================================================================== --- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -495,6 +495,46 @@ /// \returns true if the intrinsic is divergent bool isIntrinsicSourceOfDivergence(unsigned IntrID); + +// Track defaults for fields in the MODE register. +struct SIModeRegisterDefaults { + + /// Floating point opcodes that support exception flag gathering quiet and + /// propagate signaling NaN inputs per IEEE 754-2008. Min_dx10 and max_dx10 + /// become IEEE 754-2008 compliant due to signaling NaN propagation and + /// quieting. + bool IEEE : 1; + + /// Used by the vector ALU to force DX10-style treatment of NaNs: when set, + /// clamp NaN to zero; otherwise, pass NaN through. 
+ bool DX10Clamp : 1; + + // TODO: FP mode fields + + SIModeRegisterDefaults() : + IEEE(true), + DX10Clamp(true) {} + + SIModeRegisterDefaults(const Function &F); + + static SIModeRegisterDefaults getDefaultForCallingConv(CallingConv::ID CC) { + SIModeRegisterDefaults Mode; + Mode.DX10Clamp = true; + Mode.IEEE = AMDGPU::isCompute(CC); + return Mode; + } + + bool operator ==(const SIModeRegisterDefaults Other) const { + return IEEE == Other.IEEE && DX10Clamp == Other.DX10Clamp; + } + + // FIXME: Inlining should be OK for dx10-clamp, since the caller's mode should + // be able to override. + bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const { + return *this == CalleeMode; + } +}; + } // end namespace AMDGPU } // end namespace llvm Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp =================================================================== --- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1002,6 +1002,19 @@ return true; } +SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F) { + *this = getDefaultForCallingConv(F.getCallingConv()); + + StringRef IEEEAttr = F.getFnAttribute("amdgpu-ieee").getValueAsString(); + if (!IEEEAttr.empty()) + IEEE = IEEEAttr == "true"; + + StringRef DX10ClampAttr + = F.getFnAttribute("amdgpu-dx10-clamp").getValueAsString(); + if (!DX10ClampAttr.empty()) + DX10Clamp = DX10ClampAttr == "true"; +} + namespace { struct SourceOfDivergence { Index: test/CodeGen/AMDGPU/amdgcn-ieee.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/amdgcn-ieee.ll @@ -0,0 +1,188 @@ +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}kernel_ieee_mode_default: +; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] +; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] +; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]] +; GCN-DAG: 
v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]] +; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]] +; GCN-NOT: v_mul_f32 +define amdgpu_kernel void @kernel_ieee_mode_default() #0 { + %val0 = load volatile float, float addrspace(1)* undef + %val1 = load volatile float, float addrspace(1)* undef + %min = call float @llvm.minnum.f32(float %val0, float %val1) + store volatile float %min, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}kernel_ieee_mode_on: +; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] +; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] +; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]] +; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]] +; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]] +; GCN-NOT: v_mul_f32 +define amdgpu_kernel void @kernel_ieee_mode_on() #1 { + %val0 = load volatile float, float addrspace(1)* undef + %val1 = load volatile float, float addrspace(1)* undef + %min = call float @llvm.minnum.f32(float %val0, float %val1) + store volatile float %min, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}kernel_ieee_mode_off: +; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] +; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] +; GCN-NOT: [[VAL0]] +; GCN-NOT: [[VAL1]] +; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]] +; GCN-NOT: v_mul_f32 +define amdgpu_kernel void @kernel_ieee_mode_off() #2 { + %val0 = load volatile float, float addrspace(1)* undef + %val1 = load volatile float, float addrspace(1)* undef + %min = call float @llvm.minnum.f32(float %val0, float %val1) + store volatile float %min, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}func_ieee_mode_default: +; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] +; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] +; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]] +; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]] +; GCN: 
v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]] +; GCN-NOT: v_mul_f32 +define void @func_ieee_mode_default() #0 { + %val0 = load volatile float, float addrspace(1)* undef + %val1 = load volatile float, float addrspace(1)* undef + %min = call float @llvm.minnum.f32(float %val0, float %val1) + store volatile float %min, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}func_ieee_mode_on: +; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] +; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] +; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]] +; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]] +; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]] +; GCN-NOT: v_mul_f32 +define void @func_ieee_mode_on() #1 { + %val0 = load volatile float, float addrspace(1)* undef + %val1 = load volatile float, float addrspace(1)* undef + %min = call float @llvm.minnum.f32(float %val0, float %val1) + store volatile float %min, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}func_ieee_mode_off: +; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] +; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] +; GCN-NOT: [[VAL0]] +; GCN-NOT: [[VAL1]] +; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]] +; GCN-NOT: v_mul_f32 +define void @func_ieee_mode_off() #2 { + %val0 = load volatile float, float addrspace(1)* undef + %val1 = load volatile float, float addrspace(1)* undef + %min = call float @llvm.minnum.f32(float %val0, float %val1) + store volatile float %min, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}cs_ieee_mode_default: +; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] +; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] +; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]] +; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]] +; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]] +; GCN-NOT: v_mul_f32 +define amdgpu_cs void 
@cs_ieee_mode_default() #0 { + %val0 = load volatile float, float addrspace(1)* undef + %val1 = load volatile float, float addrspace(1)* undef + %min = call float @llvm.minnum.f32(float %val0, float %val1) + store volatile float %min, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}cs_ieee_mode_on: +; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] +; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] +; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]] +; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]] +; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]] +; GCN-NOT: v_mul_f32 +define amdgpu_cs void @cs_ieee_mode_on() #1 { + %val0 = load volatile float, float addrspace(1)* undef + %val1 = load volatile float, float addrspace(1)* undef + %min = call float @llvm.minnum.f32(float %val0, float %val1) + store volatile float %min, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}cs_ieee_mode_off: +; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] +; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] +; GCN-NOT: [[VAL0]] +; GCN-NOT: [[VAL1]] +; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]] +; GCN-NOT: v_mul_f32 +define amdgpu_cs void @cs_ieee_mode_off() #2 { + %val0 = load volatile float, float addrspace(1)* undef + %val1 = load volatile float, float addrspace(1)* undef + %min = call float @llvm.minnum.f32(float %val0, float %val1) + store volatile float %min, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}ps_ieee_mode_default: +; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] +; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] +; GCN-NOT: [[VAL0]] +; GCN-NOT: [[VAL1]] +; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]] +; GCN-NOT: v_mul_f32 +define amdgpu_ps void @ps_ieee_mode_default() #0 { + %val0 = load volatile float, float addrspace(1)* undef + %val1 = load volatile float, float addrspace(1)* undef + %min = call float 
@llvm.minnum.f32(float %val0, float %val1) + store volatile float %min, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}ps_ieee_mode_on: +; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] +; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] +; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]] +; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]] +; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]] +; GCN-NOT: v_mul_f32 +define amdgpu_ps void @ps_ieee_mode_on() #1 { + %val0 = load volatile float, float addrspace(1)* undef + %val1 = load volatile float, float addrspace(1)* undef + %min = call float @llvm.minnum.f32(float %val0, float %val1) + store volatile float %min, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}ps_ieee_mode_off: +; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] +; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] +; GCN-NOT: [[VAL0]] +; GCN-NOT: [[VAL1]] +; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]] +; GCN-NOT: v_mul_f32 +define amdgpu_ps void @ps_ieee_mode_off() #2 { + %val0 = load volatile float, float addrspace(1)* undef + %val1 = load volatile float, float addrspace(1)* undef + %min = call float @llvm.minnum.f32(float %val0, float %val1) + store volatile float %min, float addrspace(1)* undef + ret void +} + +declare float @llvm.minnum.f32(float, float) #3 + +attributes #0 = { nounwind } +attributes #1 = { nounwind "amdgpu-ieee"="true" } +attributes #2 = { nounwind "amdgpu-ieee"="false" } +attributes #3 = { nounwind readnone speculatable } Index: test/CodeGen/AMDGPU/clamp.ll =================================================================== --- test/CodeGen/AMDGPU/clamp.ll +++ test/CodeGen/AMDGPU/clamp.ll @@ -769,6 +769,6 @@ attributes #0 = { nounwind } attributes #1 = { nounwind readnone } -attributes #2 = { nounwind "target-features"="-dx10-clamp,-fp-exceptions" "no-nans-fp-math"="false" } -attributes #3 = { nounwind 
"target-features"="+dx10-clamp,+fp-exceptions" "no-nans-fp-math"="false" } -attributes #4 = { nounwind "target-features"="-dx10-clamp,+fp-exceptions" "no-nans-fp-math"="false" } +attributes #2 = { nounwind "amdgpu-dx10-clamp"="false" "target-features"="-fp-exceptions" "no-nans-fp-math"="false" } +attributes #3 = { nounwind "amdgpu-dx10-clamp"="true" "target-features"="+fp-exceptions" "no-nans-fp-math"="false" } +attributes #4 = { nounwind "amdgpu-dx10-clamp"="false" "target-features"="+fp-exceptions" "no-nans-fp-math"="false" } Index: test/CodeGen/AMDGPU/hsa-fp-mode.ll =================================================================== --- test/CodeGen/AMDGPU/hsa-fp-mode.ll +++ test/CodeGen/AMDGPU/hsa-fp-mode.ll @@ -70,10 +70,32 @@ ret void } +; GCN-LABEL: {{^}}test_no_ieee_mode_vi: +; GCN: float_mode = 192 +; GCN: enable_dx10_clamp = 1 +; GCN: enable_ieee_mode = 0 +define amdgpu_kernel void @test_no_ieee_mode_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #7 { + store float 0.0, float addrspace(1)* %out0 + store double 0.0, double addrspace(1)* %out1 + ret void +} + +; GCN-LABEL: {{^}}test_no_ieee_mode_no_dx10_clamp_vi: +; GCN: float_mode = 192 +; GCN: enable_dx10_clamp = 0 +; GCN: enable_ieee_mode = 0 +define amdgpu_kernel void @test_no_ieee_mode_no_dx10_clamp_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #8 { + store float 0.0, float addrspace(1)* %out0 + store double 0.0, double addrspace(1)* %out1 + ret void +} + attributes #0 = { nounwind "target-cpu"="kaveri" "target-features"="-code-object-v3" } attributes #1 = { nounwind "target-cpu"="fiji" "target-features"="-code-object-v3" } attributes #2 = { nounwind "target-features"="-code-object-v3,-fp32-denormals,+fp64-fp16-denormals" } attributes #3 = { nounwind "target-features"="-code-object-v3,+fp32-denormals,-fp64-fp16-denormals" } attributes #4 = { nounwind "target-features"="-code-object-v3,+fp32-denormals,+fp64-fp16-denormals" } attributes #5 = { nounwind 
"target-features"="-code-object-v3,-fp32-denormals,-fp64-fp16-denormals" } -attributes #6 = { nounwind "target-cpu"="fiji" "target-features"="-code-object-v3,-dx10-clamp" } +attributes #6 = { nounwind "amdgpu-dx10-clamp"="false" "target-cpu"="fiji" "target-features"="-code-object-v3" } +attributes #7 = { nounwind "amdgpu-ieee"="false" "target-cpu"="fiji" "target-features"="-code-object-v3" } +attributes #8 = { nounwind "amdgpu-dx10-clamp"="false" "amdgpu-ieee"="false" "target-cpu"="fiji" "target-features"="-code-object-v3" } Index: test/Transforms/Inline/AMDGPU/inline-amdgpu-dx10.ll =================================================================== --- /dev/null +++ test/Transforms/Inline/AMDGPU/inline-amdgpu-dx10.ll @@ -0,0 +1,107 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -inline < %s | FileCheck %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes='cgscc(inline)' < %s | FileCheck %s + +define i32 @func_default() #0 { + ret i32 0 +} + +define i32 @func_dx10_clamp_enabled() #1 { + ret i32 0 +} + +define i32 @func_dx10_clamp_disabled() #2 { + ret i32 0 +} + +; CHECK-LABEL: @default_call_default( +; CHECK-NEXT: ret i32 0 +define i32 @default_call_default() #0 { + %call = call i32 @func_default() + ret i32 %call +} + +; CHECK-LABEL: @dx10_clamp_enabled_call_default( +; CHECK-NEXT: ret i32 0 +define i32 @dx10_clamp_enabled_call_default() #1 { + %call = call i32 @func_default() + ret i32 %call +} + +; CHECK-LABEL: @dx10_clamp_enabled_call_dx10_clamp_enabled( +; CHECK-NEXT: ret i32 0 +define i32 @dx10_clamp_enabled_call_dx10_clamp_enabled() #1 { + %call = call i32 @func_dx10_clamp_enabled() + ret i32 %call +} + +; CHECK-LABEL: @dx10_clamp_enabled_call_dx10_clamp_disabled( +; CHECK-NEXT: call i32 @func_dx10_clamp_disabled() +define i32 @dx10_clamp_enabled_call_dx10_clamp_disabled() #1 { + %call = call i32 @func_dx10_clamp_disabled() + ret i32 %call +} + +; CHECK-LABEL: @dx10_clamp_disabled_call_default( +; CHECK-NEXT: call i32 @func_default() +define i32 
@dx10_clamp_disabled_call_default() #2 { + %call = call i32 @func_default() + ret i32 %call +} + +; CHECK-LABEL: @dx10_clamp_disabled_call_dx10_clamp_enabled( +; CHECK-NEXT: call i32 @func_dx10_clamp_enabled() +define i32 @dx10_clamp_disabled_call_dx10_clamp_enabled() #2 { + %call = call i32 @func_dx10_clamp_enabled() + ret i32 %call +} + +; CHECK-LABEL: @dx10_clamp_disabled_call_dx10_clamp_disabled( +; CHECK-NEXT: ret i32 0 +define i32 @dx10_clamp_disabled_call_dx10_clamp_disabled() #2 { + %call = call i32 @func_dx10_clamp_disabled() + ret i32 %call +} + +; Shader calling a compute function +; CHECK-LABEL: @amdgpu_ps_default_call_default( +; CHECK-NEXT: call i32 @func_default() +define amdgpu_ps i32 @amdgpu_ps_default_call_default() #0 { + %call = call i32 @func_default() + ret i32 %call +} + +; Shader with dx10_clamp enabled calling a compute function. Default +; also implies ieee_mode, so this isn't inlinable. +; CHECK-LABEL: @amdgpu_ps_dx10_clamp_enabled_call_default( +; CHECK-NEXT: call i32 @func_default() +define amdgpu_ps i32 @amdgpu_ps_dx10_clamp_enabled_call_default() #1 { + %call = call i32 @func_default() + ret i32 %call +} + +; CHECK-LABEL: @amdgpu_ps_dx10_clamp_disabled_call_default( +; CHECK-NEXT: call i32 @func_default() +define amdgpu_ps i32 @amdgpu_ps_dx10_clamp_disabled_call_default() #2 { + %call = call i32 @func_default() + ret i32 %call +} + +; CHECK-LABEL: @amdgpu_ps_dx10_clamp_enabled_ieee_call_default( +; CHECK-NEXT: ret i32 0 +define amdgpu_ps i32 @amdgpu_ps_dx10_clamp_enabled_ieee_call_default() #3 { + %call = call i32 @func_default() + ret i32 %call +} + +; CHECK-LABEL: @amdgpu_ps_dx10_clamp_disabled_ieee_call_default( +; CHECK-NEXT: call i32 @func_default() +define amdgpu_ps i32 @amdgpu_ps_dx10_clamp_disabled_ieee_call_default() #4 { + %call = call i32 @func_default() + ret i32 %call +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind "amdgpu-dx10-clamp"="true" } +attributes #2 = { nounwind "amdgpu-dx10-clamp"="false" } 
+attributes #3 = { nounwind "amdgpu-dx10-clamp"="true" "amdgpu-ieee"="true" } +attributes #4 = { nounwind "amdgpu-dx10-clamp"="false" "amdgpu-ieee"="true" } Index: test/Transforms/Inline/AMDGPU/inline-amdgpu-ieee.ll =================================================================== --- /dev/null +++ test/Transforms/Inline/AMDGPU/inline-amdgpu-ieee.ll @@ -0,0 +1,90 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -inline < %s | FileCheck %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes='cgscc(inline)' < %s | FileCheck %s + +define i32 @func_default() #0 { + ret i32 0 +} + +define i32 @func_ieee_enabled() #1 { + ret i32 0 +} + +define i32 @func_ieee_disabled() #2 { + ret i32 0 +} + +; CHECK-LABEL: @default_call_default( +; CHECK-NEXT: ret i32 0 +define i32 @default_call_default() #0 { + %call = call i32 @func_default() + ret i32 %call +} + +; CHECK-LABEL: @ieee_enabled_call_default( +; CHECK-NEXT: ret i32 0 +define i32 @ieee_enabled_call_default() #1 { + %call = call i32 @func_default() + ret i32 %call +} + +; CHECK-LABEL: @ieee_enabled_call_ieee_enabled( +; CHECK-NEXT: ret i32 0 +define i32 @ieee_enabled_call_ieee_enabled() #1 { + %call = call i32 @func_ieee_enabled() + ret i32 %call +} + +; CHECK-LABEL: @ieee_enabled_call_ieee_disabled( +; CHECK-NEXT: call i32 @func_ieee_disabled() +define i32 @ieee_enabled_call_ieee_disabled() #1 { + %call = call i32 @func_ieee_disabled() + ret i32 %call +} + +; CHECK-LABEL: @ieee_disabled_call_default( +; CHECK-NEXT: call i32 @func_default() +define i32 @ieee_disabled_call_default() #2 { + %call = call i32 @func_default() + ret i32 %call +} + +; CHECK-LABEL: @ieee_disabled_call_ieee_enabled( +; CHECK-NEXT: call i32 @func_ieee_enabled() +define i32 @ieee_disabled_call_ieee_enabled() #2 { + %call = call i32 @func_ieee_enabled() + ret i32 %call +} + +; CHECK-LABEL: @ieee_disabled_call_ieee_disabled( +; CHECK-NEXT: ret i32 0 +define i32 @ieee_disabled_call_ieee_disabled() #2 { + %call = call i32 @func_ieee_disabled() + ret i32 %call 
+} + +; Shader calling a compute function +; CHECK-LABEL: @amdgpu_ps_default_call_default( +; CHECK-NEXT: call i32 @func_default() +define amdgpu_ps i32 @amdgpu_ps_default_call_default() #0 { + %call = call i32 @func_default() + ret i32 %call +} + +; Shader with ieee enabled calling a compute function +; CHECK-LABEL: @amdgpu_ps_ieee_enabled_call_default( +; CHECK-NEXT: ret i32 0 +define amdgpu_ps i32 @amdgpu_ps_ieee_enabled_call_default() #1 { + %call = call i32 @func_default() + ret i32 %call +} + +; CHECK-LABEL: @amdgpu_ps_ieee_disabled_call_default( +; CHECK-NEXT: call i32 @func_default() +define amdgpu_ps i32 @amdgpu_ps_ieee_disabled_call_default() #2 { + %call = call i32 @func_default() + ret i32 %call +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind "amdgpu-ieee"="true" } +attributes #2 = { nounwind "amdgpu-ieee"="false" }