diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -78,6 +78,7 @@
   AMDGPUTTIImpl CommonTTI;
   bool IsGraphicsShader;
   bool HasFP32Denormals;
+  bool HasFP64FP16Denormals;
   unsigned MaxVGPRs;
 
   const FeatureBitset InlineFeatureIgnoreList = {
@@ -133,16 +134,19 @@
 
 public:
   explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
-    : BaseT(TM, F.getParent()->getDataLayout()),
-      ST(static_cast<const GCNSubtarget*>(TM->getSubtargetImpl(F))),
-      TLI(ST->getTargetLowering()),
-      CommonTTI(TM, F),
-      IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())),
-      HasFP32Denormals(AMDGPU::SIModeRegisterDefaults(F).allFP32Denormals()),
-      MaxVGPRs(ST->getMaxNumVGPRs(
-          std::max(ST->getWavesPerEU(F).first,
-                   ST->getWavesPerEUForWorkGroup(
-                       ST->getFlatWorkGroupSizes(F).second)))) {}
+      : BaseT(TM, F.getParent()->getDataLayout()),
+        ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
+        TLI(ST->getTargetLowering()), CommonTTI(TM, F),
+        IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())),
+        MaxVGPRs(ST->getMaxNumVGPRs(
+            std::max(ST->getWavesPerEU(F).first,
+                     ST->getWavesPerEUForWorkGroup(
+                         ST->getFlatWorkGroupSizes(F).second)))) {
+    // Build the mode defaults for F once; both denormal flags come from it.
+    AMDGPU::SIModeRegisterDefaults Mode(F);
+    HasFP32Denormals = Mode.allFP32Denormals();
+    HasFP64FP16Denormals = Mode.allFP64FP16Denormals();
+  }
 
   bool hasBranchDivergence() { return true; }
   bool useGPUDivergenceAnalysis() const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -510,11 +510,24 @@
     // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
     // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole
     // fused operation.
-    if (!HasFP32Denormals && SLT == MVT::f32 && CxtI && CxtI->hasOneUse())
+    if (CxtI && CxtI->hasOneUse())
       if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
         const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
         if (OPC == ISD::FADD || OPC == ISD::FSUB) {
-          return TargetTransformInfo::TCC_Free;
+          // f32: free unless all FP32 denormals are enabled.
+          if (SLT == MVT::f32 && !HasFP32Denormals)
+            return TargetTransformInfo::TCC_Free;
+          // f16: likewise, on subtargets with 16-bit instructions.
+          if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
+            return TargetTransformInfo::TCC_Free;
+
+          // Any type is estimated as fused under fast fp-contract, unsafe FP
+          // math, or when both the fmul and its user carry the contract flag.
+          const TargetOptions &Options = TLI->getTargetMachine().Options;
+          if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
+              Options.UnsafeFPMath ||
+              (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
+            return TargetTransformInfo::TCC_Free;
         }
       }
     LLVM_FALLTHROUGH;
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fused_costs.ll b/llvm/test/Analysis/CostModel/AMDGPU/fused_costs.ll
--- a/llvm/test/Analysis/CostModel/AMDGPU/fused_costs.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fused_costs.ll
@@ -1,48 +1,157 @@
-; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=FUSED,ALL %s
-; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefixes=SLOW,ALL %s
-; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=FUSED,ALL %s
-; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefixes=SLOW,ALL %s
-
-target triple = "amdgcn--"
-
-; ALL-LABEL: 'fmul_fadd_f32':
-; FUSED: estimated cost of 0 for instruction: %mul = fmul float
-; SLOW: estimated cost of 1 for instruction: %mul = fmul float
-; ALL: estimated cost of 1 for instruction: %add = fadd float
-define float @fmul_fadd_f32(float %r0, float %r1, float %r2) #0 {
-  %mul = fmul float %r0, %r1
-  %add = fadd float %mul, %r2
-  ret float %add
-}
-
-; ALL-LABEL: 'fmul_fadd_v2f32':
-; FUSED: estimated cost of 0 for instruction: %mul = fmul <2 x float>
-; SLOW: estimated cost of 2 for instruction: %mul = fmul <2 x float>
-; ALL: estimated cost of 2 for instruction: %add = fadd <2 x float>
-define <2 x float> @fmul_fadd_v2f32(<2 x float> %r0, <2 x float> %r1, <2 x float> %r2) #0 {
-  %mul = fmul <2 x float> %r0, %r1
-  %add = fadd <2 x float> %mul, %r2
-  ret <2 x float> %add
-}
-
-; ALL-LABEL: 'fmul_fsub_f32':
-; FUSED: estimated cost of 0 for instruction: %mul = fmul float
-; SLOW: estimated cost of 1 for instruction: %mul = fmul float
-; ALL: estimated cost of 1 for instruction: %sub = fsub float
-define float @fmul_fsub_f32(float %r0, float %r1, float %r2) #0 {
-  %mul = fmul float %r0, %r1
-  %sub = fsub float %mul, %r2
-  ret float %sub
-}
-
-; ALL-LABEL: 'fmul_fsub_v2f32':
-; FUSED: estimated cost of 0 for instruction: %mul = fmul <2 x float>
-; SLOW: estimated cost of 2 for instruction: %mul = fmul <2 x float>
-; ALL: estimated cost of 2 for instruction: %sub = fsub <2 x float>
-define <2 x float> @fmul_fsub_v2f32(<2 x float> %r0, <2 x float> %r1, <2 x float> %r2) #0 {
-  %mul = fmul <2 x float> %r0, %r1
-  %sub = fsub <2 x float> %mul, %r2
-  ret <2 x float> %sub
-}
-
-attributes #0 = { nounwind }
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=FUSED,NOCONTRACT,ALL %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=on < %s | FileCheck -check-prefixes=SLOW,NOCONTRACT,ALL %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=fast < %s | FileCheck -check-prefixes=FUSED,CONTRACT,ALL %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=FUSED,NOCONTRACT,ALL %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=on < %s | FileCheck -check-prefixes=SLOW,NOCONTRACT,ALL %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=fast < %s | FileCheck -check-prefixes=FUSED,CONTRACT,ALL %s
+
+target triple = "amdgcn--"
+
+; ALL-LABEL: 'fmul_fadd_f32':
+; FUSED: estimated cost of 0 for instruction: %mul = fmul float
+; SLOW: estimated cost of 1 for instruction: %mul = fmul float
+; ALL: estimated cost of 1 for instruction: %add = fadd float
+define float @fmul_fadd_f32(float %r0, float %r1, float %r2) #0 {
+  %mul = fmul float %r0, %r1
+  %add = fadd float %mul, %r2
+  ret float %add
+}
+
+; ALL-LABEL: 'fmul_fadd_contract_f32':
+; ALL: estimated cost of 0 for instruction: %mul = fmul contract float
+; ALL: estimated cost of 1 for instruction: %add = fadd contract float
+define float @fmul_fadd_contract_f32(float %r0, float %r1, float %r2) #0 {
+  %mul = fmul contract float %r0, %r1
+  %add = fadd contract float %mul, %r2
+  ret float %add
+}
+
+; ALL-LABEL: 'fmul_fadd_v2f32':
+; FUSED: estimated cost of 0 for instruction: %mul = fmul <2 x float>
+; SLOW: estimated cost of 2 for instruction: %mul = fmul <2 x float>
+; ALL: estimated cost of 2 for instruction: %add = fadd <2 x float>
+define <2 x float> @fmul_fadd_v2f32(<2 x float> %r0, <2 x float> %r1, <2 x float> %r2) #0 {
+  %mul = fmul <2 x float> %r0, %r1
+  %add = fadd <2 x float> %mul, %r2
+  ret <2 x float> %add
+}
+
+; ALL-LABEL: 'fmul_fsub_f32':
+; FUSED: estimated cost of 0 for instruction: %mul = fmul float
+; SLOW: estimated cost of 1 for instruction: %mul = fmul float
+; ALL: estimated cost of 1 for instruction: %sub = fsub float
+define float @fmul_fsub_f32(float %r0, float %r1, float %r2) #0 {
+  %mul = fmul float %r0, %r1
+  %sub = fsub float %mul, %r2
+  ret float %sub
+}
+
+; ALL-LABEL: 'fmul_fsub_v2f32':
+; FUSED: estimated cost of 0 for instruction: %mul = fmul <2 x float>
+; SLOW: estimated cost of 2 for instruction: %mul = fmul <2 x float>
+; ALL: estimated cost of 2 for instruction: %sub = fsub <2 x float>
+define <2 x float> @fmul_fsub_v2f32(<2 x float> %r0, <2 x float> %r1, <2 x float> %r2) #0 {
+  %mul = fmul <2 x float> %r0, %r1
+  %sub = fsub <2 x float> %mul, %r2
+  ret <2 x float> %sub
+}
+
+; ALL-LABEL: 'fmul_fadd_f16':
+; FUSED: estimated cost of 0 for instruction: %mul = fmul half
+; SLOW: estimated cost of 1 for instruction: %mul = fmul half
+; ALL: estimated cost of 1 for instruction: %add = fadd half
+define half @fmul_fadd_f16(half %r0, half %r1, half %r2) #0 {
+  %mul = fmul half %r0, %r1
+  %add = fadd half %mul, %r2
+  ret half %add
+}
+
+; ALL-LABEL: 'fmul_fadd_contract_f16':
+; ALL: estimated cost of 0 for instruction: %mul = fmul contract half
+; ALL: estimated cost of 1 for instruction: %add = fadd contract half
+define half @fmul_fadd_contract_f16(half %r0, half %r1, half %r2) #0 {
+  %mul = fmul contract half %r0, %r1
+  %add = fadd contract half %mul, %r2
+  ret half %add
+}
+
+; ALL-LABEL: 'fmul_fadd_v2f16':
+; FUSED: estimated cost of 0 for instruction: %mul = fmul <2 x half>
+; SLOW: estimated cost of 1 for instruction: %mul = fmul <2 x half>
+; ALL: estimated cost of 1 for instruction: %add = fadd <2 x half>
+define <2 x half> @fmul_fadd_v2f16(<2 x half> %r0, <2 x half> %r1, <2 x half> %r2) #0 {
+  %mul = fmul <2 x half> %r0, %r1
+  %add = fadd <2 x half> %mul, %r2
+  ret <2 x half> %add
+}
+
+; ALL-LABEL: 'fmul_fsub_f16':
+; FUSED: estimated cost of 0 for instruction: %mul = fmul half
+; SLOW: estimated cost of 1 for instruction: %mul = fmul half
+; ALL: estimated cost of 1 for instruction: %sub = fsub half
+define half @fmul_fsub_f16(half %r0, half %r1, half %r2) #0 {
+  %mul = fmul half %r0, %r1
+  %sub = fsub half %mul, %r2
+  ret half %sub
+}
+
+; ALL-LABEL: 'fmul_fsub_v2f16':
+; FUSED: estimated cost of 0 for instruction: %mul = fmul <2 x half>
+; SLOW: estimated cost of 1 for instruction: %mul = fmul <2 x half>
+; ALL: estimated cost of 1 for instruction: %sub = fsub <2 x half>
+define <2 x half> @fmul_fsub_v2f16(<2 x half> %r0, <2 x half> %r1, <2 x half> %r2) #0 {
+  %mul = fmul <2 x half> %r0, %r1
+  %sub = fsub <2 x half> %mul, %r2
+  ret <2 x half> %sub
+}
+
+; ALL-LABEL: 'fmul_fadd_f64':
+; CONTRACT: estimated cost of 0 for instruction: %mul = fmul double
+; NOCONTRACT: estimated cost of 3 for instruction: %mul = fmul double
+; ALL: estimated cost of 3 for instruction: %add = fadd double
+define double @fmul_fadd_f64(double %r0, double %r1, double %r2) #0 {
+  %mul = fmul double %r0, %r1
+  %add = fadd double %mul, %r2
+  ret double %add
+}
+
+; ALL-LABEL: 'fmul_fadd_contract_f64':
+; ALL: estimated cost of 0 for instruction: %mul = fmul contract double
+; ALL: estimated cost of 3 for instruction: %add = fadd contract double
+define double @fmul_fadd_contract_f64(double %r0, double %r1, double %r2) #0 {
+  %mul = fmul contract double %r0, %r1
+  %add = fadd contract double %mul, %r2
+  ret double %add
+}
+
+; ALL-LABEL: 'fmul_fadd_v2f64':
+; CONTRACT: estimated cost of 0 for instruction: %mul = fmul <2 x double>
+; NOCONTRACT: estimated cost of 6 for instruction: %mul = fmul <2 x double>
+; ALL: estimated cost of 6 for instruction: %add = fadd <2 x double>
+define <2 x double> @fmul_fadd_v2f64(<2 x double> %r0, <2 x double> %r1, <2 x double> %r2) #0 {
+  %mul = fmul <2 x double> %r0, %r1
+  %add = fadd <2 x double> %mul, %r2
+  ret <2 x double> %add
+}
+
+; ALL-LABEL: 'fmul_fsub_f64':
+; CONTRACT: estimated cost of 0 for instruction: %mul = fmul double
+; NOCONTRACT: estimated cost of 3 for instruction: %mul = fmul double
+; ALL: estimated cost of 3 for instruction: %sub = fsub double
+define double @fmul_fsub_f64(double %r0, double %r1, double %r2) #0 {
+  %mul = fmul double %r0, %r1
+  %sub = fsub double %mul, %r2
+  ret double %sub
+}
+
+; ALL-LABEL: 'fmul_fsub_v2f64':
+; CONTRACT: estimated cost of 0 for instruction: %mul = fmul <2 x double>
+; NOCONTRACT: estimated cost of 6 for instruction: %mul = fmul <2 x double>
+; ALL: estimated cost of 6 for instruction: %sub = fsub <2 x double>
+define <2 x double> @fmul_fsub_v2f64(<2 x double> %r0, <2 x double> %r1, <2 x double> %r2) #0 {
+  %mul = fmul <2 x double> %r0, %r1
+  %sub = fsub <2 x double> %mul, %r2
+  ret <2 x double> %sub
+}
+
+attributes #0 = { nounwind }