Index: llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -69,15 +69,14 @@ // We want to use these instructions, and using fp32 denormals also causes // instructions to run at the double precision rate for the device so it's // probably best to just report no single precision denormals. -static uint32_t getFPMode(const MachineFunction &F) { - const GCNSubtarget& ST = F.getSubtarget(); - // TODO: Is there any real use for the flush in only / flush out only modes? +static uint32_t getFPMode(AMDGPU::SIModeRegisterDefaults Mode) { + // TODO: Is there any real use for the flush in only / flush out only modes? uint32_t FP32Denormals = - ST.hasFP32Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; + Mode.FP32Denormals ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; uint32_t FP64Denormals = - ST.hasFP64Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; + Mode.FP64FP16Denormals ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) | FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) | @@ -1026,11 +1025,12 @@ ProgInfo.VGPRBlocks = IsaInfo::getNumVGPRBlocks( &STM, ProgInfo.NumVGPRsForWavesPerEU); + const SIModeRegisterDefaults Mode = MFI->getMode(); + // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode // register. - ProgInfo.FloatMode = getFPMode(MF); + ProgInfo.FloatMode = getFPMode(Mode); - const SIModeRegisterDefaults Mode = MFI->getMode(); ProgInfo.IEEEMode = Mode.IEEE; // Make clamp modifier on NaN input returns 0. 
Index: llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -69,6 +69,7 @@ Module *Mod = nullptr; const DataLayout *DL = nullptr; bool HasUnsafeFPMath = false; + bool HasFP32Denormals = false; /// Copies exact/nsw/nuw flags (if any) from binary operation \p I to /// binary operation \p V. @@ -574,7 +575,6 @@ Value *NewFDiv = nullptr; - bool HasDenormals = ST->hasFP32Denormals(); if (VectorType *VT = dyn_cast(Ty)) { NewFDiv = UndefValue::get(VT); @@ -585,7 +585,7 @@ Value *DenEltI = Builder.CreateExtractElement(Den, I); Value *NewElt; - if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasDenormals)) { + if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasFP32Denormals)) { NewElt = Builder.CreateFDiv(NumEltI, DenEltI); } else { NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI }); @@ -594,7 +594,7 @@ NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I); } } else { - if (!shouldKeepFDivF32(Num, UnsafeDiv, HasDenormals)) + if (!shouldKeepFDivF32(Num, UnsafeDiv, HasFP32Denormals)) NewFDiv = Builder.CreateCall(Decl, { Num, Den }); } @@ -1033,6 +1033,7 @@ AC = &getAnalysis().getAssumptionCache(F); DA = &getAnalysis(); HasUnsafeFPMath = hasUnsafeFPMath(F); + HasFP32Denormals = ST->hasFP32Denormals(F); bool MadeChange = false; Index: llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -127,6 +127,10 @@ // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can // make the right decision when generating code for different targets. const GCNSubtarget *Subtarget; + + // Default FP mode for the current function. 
+ AMDGPU::SIModeRegisterDefaults Mode; + bool EnableLateStructurizeCFG; public: @@ -392,6 +396,7 @@ } #endif Subtarget = &MF.getSubtarget(); + Mode = AMDGPU::SIModeRegisterDefaults(MF.getFunction(), *Subtarget); return SelectionDAGISel::runOnMachineFunction(MF); } @@ -2103,7 +2108,7 @@ bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods); bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods); - assert((IsFMA || !Subtarget->hasFP32Denormals()) && + assert((IsFMA || !Mode.FP32Denormals) && "fmad selected with denormals enabled"); // TODO: We can select this with f32 denormals enabled if all the sources are // converted from f16 (in which case fmad isn't legal). Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -1584,8 +1584,11 @@ // float fqneg = -fq; SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq); + MachineFunction &MF = DAG.getMachineFunction(); + const AMDGPUMachineFunction *MFI = MF.getInfo(); + // float fr = mad(fqneg, fb, fa); - unsigned OpCode = Subtarget->hasFP32Denormals() ? + unsigned OpCode = MFI->getMode().FP32Denormals ? (unsigned)AMDGPUISD::FMAD_FTZ : (unsigned)ISD::FMAD; SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa); @@ -1666,8 +1669,11 @@ } if (isTypeLegal(MVT::i64)) { + MachineFunction &MF = DAG.getMachineFunction(); + const SIMachineFunctionInfo *MFI = MF.getInfo(); + // Compute denominator reciprocal. - unsigned FMAD = Subtarget->hasFP32Denormals() ? + unsigned FMAD = MFI->getMode().FP32Denormals ? 
(unsigned)AMDGPUISD::FMAD_FTZ : (unsigned)ISD::FMAD; Index: llvm/lib/Target/AMDGPU/AMDGPUInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -100,13 +100,16 @@ class AMDGPUPat : Pat, PredicateControl; -def FP16Denormals : Predicate<"Subtarget->hasFP16Denormals()">; -def FP32Denormals : Predicate<"Subtarget->hasFP32Denormals()">; -def FP64Denormals : Predicate<"Subtarget->hasFP64Denormals()">; -def NoFP16Denormals : Predicate<"!Subtarget->hasFP16Denormals()">; -def NoFP32Denormals : Predicate<"!Subtarget->hasFP32Denormals()">; -def NoFP64Denormals : Predicate<"!Subtarget->hasFP64Denormals()">; +let RecomputePerFunction = 1 in { +def FP16Denormals : Predicate<"MF->getInfo()->getMode().FP64FP16Denormals">; +def FP32Denormals : Predicate<"MF->getInfo()->getMode().FP32Denormals">; +def FP64Denormals : Predicate<"MF->getInfo()->getMode().FP64FP16Denormals">; +def NoFP16Denormals : Predicate<"!MF->getInfo()->getMode().FP64FP16Denormals">; +def NoFP32Denormals : Predicate<"!MF->getInfo()->getMode().FP32Denormals">; +def NoFP64Denormals : Predicate<"!MF->getInfo()->getMode().FP64FP16Denormals">; def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">; +} + def FMA : Predicate<"Subtarget->hasFMA()">; def InstFlag : OperandWithDefaultOps ; Index: llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -11,6 +11,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/CodeGen/MachineFunction.h" +#include "Utils/AMDGPUBaseInfo.h" namespace llvm { @@ -28,6 +29,9 @@ /// Number of bytes in the LDS that are being used. unsigned LDSSize; + // State of MODE register, assumed FP mode. + AMDGPU::SIModeRegisterDefaults Mode; + // Kernels + shaders. i.e. 
functions called by the driver and not called // by other functions. bool IsEntryFunction; @@ -53,6 +57,10 @@ return LDSSize; } + AMDGPU::SIModeRegisterDefaults getMode() const { + return Mode; + } + bool isEntryFunction() const { return IsEntryFunction; } Index: llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -18,6 +18,7 @@ LocalMemoryObjects(), ExplicitKernArgSize(0), LDSSize(0), + Mode(MF.getFunction(), MF.getSubtarget()), IsEntryFunction(AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())), NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath), MemoryBound(false), Index: llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -148,7 +148,12 @@ return HasMadMixInsts; } - bool hasFP32Denormals() const { + bool hasFP32Denormals(const Function &F) const { + // FIXME: This should not be a property of the subtarget. This should be a + // property with a default set by the calling convention which can be + // overridden by attributes. For now, use the subtarget feature as a + // placeholder attribute. The function argument's only purpose is to + // discourage use without a function context until this is removed.
return FP32Denormals; } @@ -612,11 +617,17 @@ unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, const Function &) const; - bool hasFP16Denormals() const { + /// Alias for hasFP64FP16Denormals + bool hasFP16Denormals(const Function &F) const { return FP64FP16Denormals; } - bool hasFP64Denormals() const { + /// Alias for hasFP64FP16Denormals + bool hasFP64Denormals(const Function &F) const { + return FP64FP16Denormals; + } + + bool hasFP64FP16Denormals(const Function &F) const { return FP64FP16Denormals; } Index: llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -46,7 +46,7 @@ Triple TargetTriple; - const TargetSubtargetInfo *ST; + const GCNSubtarget *ST; const TargetLoweringBase *TLI; const TargetSubtargetInfo *getST() const { return ST; } @@ -73,6 +73,7 @@ const AMDGPUTargetLowering *TLI; AMDGPUTTIImpl CommonTTI; bool IsGraphicsShader; + bool HasFP32Denormals; const FeatureBitset InlineFeatureIgnoreList = { // Codegen control options which don't matter. @@ -131,7 +132,8 @@ ST(static_cast(TM->getSubtargetImpl(F))), TLI(ST->getTargetLowering()), CommonTTI(TM, F), - IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())) {} + IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())), + HasFP32Denormals(ST->hasFP32Denormals(F)) { } bool hasBranchDivergence() { return true; } Index: llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -412,7 +412,7 @@ if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) { // TODO: This is more complicated, unsafe flags etc. 
- if ((SLT == MVT::f32 && !ST->hasFP32Denormals()) || + if ((SLT == MVT::f32 && !HasFP32Denormals) || (SLT == MVT::f16 && ST->has16BitInsts())) { return LT.first * getQuarterRateInstrCost() * NElts; } @@ -431,7 +431,7 @@ if (SLT == MVT::f32 || SLT == MVT::f16) { int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost(); - if (!ST->hasFP32Denormals()) { + if (!HasFP32Denormals) { // FP mode switches. Cost += 2 * getFullRateInstrCost(); } @@ -671,10 +671,13 @@ bool GCNTTIImpl::areInlineCompatible(const Function *Caller, const Function *Callee) const { const TargetMachine &TM = getTLI()->getTargetMachine(); - const FeatureBitset &CallerBits = - TM.getSubtargetImpl(*Caller)->getFeatureBits(); - const FeatureBitset &CalleeBits = - TM.getSubtargetImpl(*Callee)->getFeatureBits(); + const GCNSubtarget *CallerST + = static_cast(TM.getSubtargetImpl(*Caller)); + const GCNSubtarget *CalleeST + = static_cast(TM.getSubtargetImpl(*Callee)); + + const FeatureBitset &CallerBits = CallerST->getFeatureBits(); + const FeatureBitset &CalleeBits = CalleeST->getFeatureBits(); FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList; FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList; @@ -683,8 +686,8 @@ // FIXME: dx10_clamp can just take the caller setting, but there seems to be // no way to support merge for backend defined attributes. 
- AMDGPU::SIModeRegisterDefaults CallerMode(*Caller); - AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee); + AMDGPU::SIModeRegisterDefaults CallerMode(*Caller, *CallerST); + AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST); return CallerMode.isInlineCompatible(CalleeMode); } Index: llvm/lib/Target/AMDGPU/R600ISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -223,10 +223,8 @@ setOperationAction(ISD::FMA, MVT::f64, Expand); } - // FIXME: This was moved from AMDGPUTargetLowering, I'm not sure if we - // need it for R600. - if (!Subtarget->hasFP32Denormals()) - setOperationAction(ISD::FMAD, MVT::f32, Legal); + // FIXME: May need no denormals check + setOperationAction(ISD::FMAD, MVT::f32, Legal); if (!Subtarget->hasBFI()) { // fcopysign can be done in a single instruction with BFI. Index: llvm/lib/Target/AMDGPU/SIFoldOperands.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1359,8 +1359,8 @@ case AMDGPU::V_MUL_F32_e64: case AMDGPU::V_MUL_F16_e64: { // If output denormals are enabled, omod is ignored. - if ((Op == AMDGPU::V_MUL_F32_e64 && ST->hasFP32Denormals()) || - (Op == AMDGPU::V_MUL_F16_e64 && ST->hasFP16Denormals())) + if ((Op == AMDGPU::V_MUL_F32_e64 && MFI->getMode().FP32Denormals) || + (Op == AMDGPU::V_MUL_F16_e64 && MFI->getMode().FP64FP16Denormals)) return std::make_pair(nullptr, SIOutMods::NONE); const MachineOperand *RegOp = nullptr; @@ -1389,8 +1389,8 @@ case AMDGPU::V_ADD_F32_e64: case AMDGPU::V_ADD_F16_e64: { // If output denormals are enabled, omod is ignored. 
- if ((Op == AMDGPU::V_ADD_F32_e64 && ST->hasFP32Denormals()) || - (Op == AMDGPU::V_ADD_F16_e64 && ST->hasFP16Denormals())) + if ((Op == AMDGPU::V_ADD_F32_e64 && MFI->getMode().FP32Denormals) || + (Op == AMDGPU::V_ADD_F16_e64 && MFI->getMode().FP64FP16Denormals)) return std::make_pair(nullptr, SIOutMods::NONE); // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x Index: llvm/lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.h +++ llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -393,7 +393,7 @@ bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth = 5) const; - bool denormalsEnabledForType(EVT VT) const; + bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const; bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -100,6 +100,16 @@ cl::desc("Do not align and prefetch loops"), cl::init(false)); +static bool hasFP32Denormals(const MachineFunction &MF) { + const SIMachineFunctionInfo *Info = MF.getInfo(); + return Info->getMode().FP32Denormals; +} + +static bool hasFP64FP16Denormals(const MachineFunction &MF) { + const SIMachineFunctionInfo *Info = MF.getInfo(); + return Info->getMode().FP64FP16Denormals; +} + static unsigned findFirstFreeSGPR(CCState &CCInfo) { unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) { @@ -363,9 +373,10 @@ setOperationAction(ISD::FLOG10, MVT::f16, Custom); } - // v_mad_f32 does not support denormals according to some sources. - if (!Subtarget->hasFP32Denormals()) - setOperationAction(ISD::FMAD, MVT::f32, Legal); + // v_mad_f32 does not support denormals. 
We report it as unconditionally + // legal, and the context where it is formed will disallow it when fp32 + // denormals are enabled. + setOperationAction(ISD::FMAD, MVT::f32, Legal); if (!Subtarget->hasBFI()) { // fcopysign can be done in a single instruction with BFI. @@ -503,7 +514,7 @@ // F16 - VOP3 Actions. setOperationAction(ISD::FMA, MVT::f16, Legal); - if (!Subtarget->hasFP16Denormals() && STI.hasMadF16()) + if (STI.hasMadF16()) setOperationAction(ISD::FMAD, MVT::f16, Legal); for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) { @@ -765,8 +776,9 @@ EVT DestVT, EVT SrcVT) const { return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) || (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) && - DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() && - SrcVT.getScalarType() == MVT::f16; + DestVT.getScalarType() == MVT::f32 && + SrcVT.getScalarType() == MVT::f16 && + !hasFP32Denormals(DAG.getMachineFunction()); } bool SITargetLowering::isShuffleMaskLegal(ArrayRef, EVT) const { @@ -3923,7 +3935,7 @@ // mad available which returns the same result as the separate operations // which we should prefer over fma. We can't use this if we want to support // denormals, so only report this in these cases. - if (Subtarget->hasFP32Denormals()) + if (hasFP32Denormals(MF)) return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts(); // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32. @@ -3932,7 +3944,7 @@ case MVT::f64: return true; case MVT::f16: - return Subtarget->has16BitInsts() && Subtarget->hasFP16Denormals(); + return Subtarget->has16BitInsts() && hasFP64FP16Denormals(MF); default: break; } @@ -3946,9 +3958,11 @@ // v_mad_f32/v_mac_f32 do not support denormals. 
EVT VT = N->getValueType(0); if (VT == MVT::f32) - return !Subtarget->hasFP32Denormals(); - if (VT == MVT::f16) - return !Subtarget->hasFP16Denormals() && Subtarget->hasMadF16(); + return !hasFP32Denormals(DAG.getMachineFunction()); + if (VT == MVT::f16) { + return Subtarget->hasMadF16() && + !hasFP64FP16Denormals(DAG.getMachineFunction()); + } return false; } @@ -7536,7 +7550,7 @@ const SDNodeFlags Flags = Op->getFlags(); bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || Flags.hasAllowReciprocal(); - if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals()) + if (!Unsafe && VT == MVT::f32 && hasFP32Denormals(DAG.getMachineFunction())) return SDValue(); if (const ConstantFPSDNode *CLHS = dyn_cast(LHS)) { @@ -7679,7 +7693,7 @@ static const SDValue getSPDenormModeValue(int SPDenormMode, SelectionDAG &DAG, const SDLoc &SL, const GCNSubtarget *ST) { assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE"); - int DPDenormModeDefault = ST->hasFP64Denormals() + int DPDenormModeDefault = hasFP64FP16Denormals(DAG.getMachineFunction()) ? 
FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; @@ -7715,7 +7729,9 @@ (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16); - if (!Subtarget->hasFP32Denormals()) { + const bool HasFP32Denormals = hasFP32Denormals(DAG.getMachineFunction()); + + if (!HasFP32Denormals) { SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue EnableDenorm; @@ -7759,8 +7775,7 @@ SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, NumeratorScaled, Fma3); - if (!Subtarget->hasFP32Denormals()) { - + if (!HasFP32Denormals) { SDValue DisableDenorm; if (Subtarget->hasDenormModeInst()) { const SDValue DisableDenormValue = @@ -8734,7 +8749,7 @@ auto F = CFP->getValueAPF(); if (F.isNaN() && F.isSignaling()) return false; - return !F.isDenormal() || denormalsEnabledForType(Op.getValueType()); + return !F.isDenormal() || denormalsEnabledForType(DAG, Op.getValueType()); } // If source is a result of another standard FP operation it is already in @@ -8803,7 +8818,7 @@ // snans will be quieted, so we only need to worry about denormals. if (Subtarget->supportsMinMaxDenormModes() || - denormalsEnabledForType(Op.getValueType())) + denormalsEnabledForType(DAG, Op.getValueType())) return true; // Flushing may be required. @@ -8875,7 +8890,7 @@ LLVM_FALLTHROUGH; } default: - return denormalsEnabledForType(Op.getValueType()) && + return denormalsEnabledForType(DAG, Op.getValueType()) && DAG.isKnownNeverSNaN(Op); } @@ -8886,7 +8901,7 @@ SDValue SITargetLowering::getCanonicalConstantFP( SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const { // Flush denormals to 0 if not enabled. - if (C.isDenormal() && !denormalsEnabledForType(VT)) + if (C.isDenormal() && !denormalsEnabledForType(DAG, VT)) return DAG.getConstantFP(0.0, SL, VT); if (C.isNaN()) { @@ -9424,8 +9439,8 @@ // Only do this if we are not trying to support denormals. v_mad_f32 does not // support denormals ever. 
- if (((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) || - (VT == MVT::f16 && !Subtarget->hasFP16Denormals() && + if (((VT == MVT::f32 && !hasFP32Denormals(DAG.getMachineFunction())) || + (VT == MVT::f16 && !hasFP64FP16Denormals(DAG.getMachineFunction()) && getSubtarget()->hasMadF16())) && isOperationLegal(ISD::FMAD, VT)) return ISD::FMAD; @@ -10891,14 +10906,14 @@ return false; } -bool SITargetLowering::denormalsEnabledForType(EVT VT) const { +bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG, + EVT VT) const { switch (VT.getScalarType().getSimpleVT().SimpleTy) { case MVT::f32: - return Subtarget->hasFP32Denormals(); + return hasFP32Denormals(DAG.getMachineFunction()); case MVT::f64: - return Subtarget->hasFP64Denormals(); case MVT::f16: - return Subtarget->hasFP16Denormals(); + return hasFP64FP16Denormals(DAG.getMachineFunction()); default: return false; } Index: llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h =================================================================== --- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -340,9 +340,6 @@ AMDGPUFunctionArgInfo ArgInfo; - // State of MODE register, assumed FP mode. - AMDGPU::SIModeRegisterDefaults Mode; - // Graphics info. 
unsigned PSInputAddr = 0; unsigned PSInputEnable = 0; @@ -515,10 +512,6 @@ : I->second.Lanes[Lane]; } - AMDGPU::SIModeRegisterDefaults getMode() const { - return Mode; - } - bool haveFreeLanesForSGPRSpill(const MachineFunction &MF, unsigned NumLane) const; bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI); Index: llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -28,7 +28,6 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) : AMDGPUMachineFunction(MF), - Mode(MF.getFunction()), PrivateSegmentBuffer(false), DispatchPtr(false), QueuePtr(false), Index: llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h =================================================================== --- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -676,7 +676,8 @@ FP32Denormals(true), FP64FP16Denormals(true) {} - SIModeRegisterDefaults(const Function &F); + // FIXME: Should not depend on the subtarget + SIModeRegisterDefaults(const Function &F, const GCNSubtarget &ST); static SIModeRegisterDefaults getDefaultForCallingConv(CallingConv::ID CC) { const bool IsCompute = AMDGPU::isCompute(CC); @@ -695,10 +696,23 @@ FP64FP16Denormals == Other.FP64FP16Denormals; } + /// Returns true if the flag matches in caller and callee, or if it is + /// enabled in the callee but disabled in the caller. + static bool oneWayCompatible(bool CallerMode, bool CalleeMode) { + return CallerMode == CalleeMode || (!CallerMode && CalleeMode); + } + // FIXME: Inlining should be OK for dx10-clamp, since the caller's mode should // be able to override.
bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const { - return *this == CalleeMode; + if (DX10Clamp != CalleeMode.DX10Clamp) + return false; + if (IEEE != CalleeMode.IEEE) + return false; + + // Allow inlining denormals enabled into denormals flushed functions. + return oneWayCompatible(FP64FP16Denormals, CalleeMode.FP64FP16Denormals) && + oneWayCompatible(FP32Denormals, CalleeMode.FP32Denormals); } }; Index: llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1302,7 +1302,8 @@ return true; } -SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F) { +SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F, + const GCNSubtarget &ST) { *this = getDefaultForCallingConv(F.getCallingConv()); StringRef IEEEAttr = F.getFnAttribute("amdgpu-ieee").getValueAsString(); @@ -1313,6 +1314,9 @@ = F.getFnAttribute("amdgpu-dx10-clamp").getValueAsString(); if (!DX10ClampAttr.empty()) DX10Clamp = DX10ClampAttr == "true"; + + FP32Denormals = ST.hasFP32Denormals(F); + FP64FP16Denormals = ST.hasFP64FP16Denormals(F); } namespace { Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcanonicalize.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcanonicalize.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcanonicalize.mir @@ -37,8 +37,8 @@ liveins: $vgpr0 ; GFX9-LABEL: name: fcanonicalize_f16_flush ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[V_MAX_F16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F16_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec - ; GFX9: S_ENDPGM 0, implicit [[V_MAX_F16_e64_]] + ; GFX9: [[V_MUL_F16_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F16_e64 0, 15360, 0, [[COPY]], 0, 0, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_MUL_F16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_TRUNC 
%0 %2:vgpr(s16) = G_FCANONICALIZE %1 @@ -60,8 +60,8 @@ ; GFX9-LABEL: name: fcanonicalize_f32_denorm ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, 1065353216, 0, [[COPY]], 0, 0, implicit $exec - ; GFX9: S_ENDPGM 0, implicit [[V_MUL_F32_e64_]] + ; GFX9: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_MAX_F32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = G_FCANONICALIZE %0 S_ENDPGM 0, implicit %1 @@ -170,8 +170,8 @@ ; GFX9-LABEL: name: fcanonicalize_f64_flush ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[V_MAX_F64_:%[0-9]+]]:vreg_64 = V_MAX_F64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec - ; GFX9: S_ENDPGM 0, implicit [[V_MAX_F64_]] + ; GFX9: [[V_MUL_F64_:%[0-9]+]]:vreg_64 = V_MUL_F64 0, 4607182418800017408, 0, [[COPY]], 0, 0, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_MUL_F64_]] %0:vgpr(s64) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_FCANONICALIZE %0 S_ENDPGM 0, implicit %1 @@ -191,8 +191,8 @@ liveins: $vgpr0 ; GFX9-LABEL: name: fcanonicalize_fabs_f32_denorm ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, 1065353216, 2, [[COPY]], 0, 0, implicit $exec - ; GFX9: S_ENDPGM 0, implicit [[V_MUL_F32_e64_]] + ; GFX9: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 2, [[COPY]], 2, [[COPY]], 0, 0, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_MAX_F32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = G_FABS %0 %2:vgpr(s32) = G_FCANONICALIZE %1 @@ -237,8 +237,8 @@ liveins: $vgpr0 ; GFX9-LABEL: name: fcanonicalize_fneg_f32_denorm ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, 3212836864, 0, [[COPY]], 0, 0, implicit $exec - ; GFX9: S_ENDPGM 0, implicit [[V_MUL_F32_e64_]] + ; GFX9: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 1, [[COPY]], 1, [[COPY]], 0, 0, implicit $exec 
+ ; GFX9: S_ENDPGM 0, implicit [[V_MAX_F32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = G_FNEG %0 %2:vgpr(s32) = G_FCANONICALIZE %1 @@ -283,8 +283,8 @@ ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 ; GFX9: [[V_XOR_B32_e32_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e32 [[S_MOV_B32_]], [[COPY]], implicit $exec - ; GFX9: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, 1065353216, 2, [[V_XOR_B32_e32_]], 0, 0, implicit $exec - ; GFX9: S_ENDPGM 0, implicit [[V_MUL_F32_e64_]] + ; GFX9: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 2, [[V_XOR_B32_e32_]], 2, [[V_XOR_B32_e32_]], 0, 0, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_MAX_F32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = G_FNEG %0 %2:vgpr(s32) = G_FABS %1