diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -658,7 +658,8 @@
     if (auto *VTy = dyn_cast<VectorType>(Ty)) {
       unsigned Num = cast<FixedVectorType>(VTy)->getNumElements();
       unsigned Cost = thisT()->getArithmeticInstrCost(
-          Opcode, VTy->getScalarType(), CostKind);
+          Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
+          Opd1PropInfo, Opd2PropInfo, Args, CxtI);
       // Return the cost of multiple scalar invocation plus the cost of
       // inserting and extracting the values.
       return getScalarizationOverhead(VTy, Args) + Num * Cost;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -452,8 +452,8 @@
     // implementation tries to generate legalize and scalarization costs. Maybe
     // we could hoist the scalarization code here?
     return BaseT::getArithmeticInstrCost(Opcode, Ty, TTI::TCK_RecipThroughput,
-                                         Opd1Info, Opd2Info,
-                                         Opd1PropInfo, Opd2PropInfo);
+                                         Opd1Info, Opd2Info, Opd1PropInfo,
+                                         Opd2PropInfo, Args, CxtI);
   }

   // Legalize the type.
@@ -506,9 +506,20 @@
     // i32
     return QuarterRateCost * NElts * LT.first;
   }
+  case ISD::FMUL:
+    // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
+    // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole
+    // fused operation.
+    if (!HasFP32Denormals && SLT == MVT::f32 && CxtI && CxtI->hasOneUse())
+      if (const auto *FAdd = dyn_cast<Instruction>(*CxtI->user_begin())) {
+        const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
+        if (OPC == ISD::FADD || OPC == ISD::FSUB) {
+          return TargetTransformInfo::TCC_Free;
+        }
+      }
+    LLVM_FALLTHROUGH;
   case ISD::FADD:
   case ISD::FSUB:
-  case ISD::FMUL:
     if (SLT == MVT::f64)
       return LT.first * NElts * get64BitInstrCost();

@@ -568,9 +579,8 @@
     break;
   }

-  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
-                                       Opd2Info,
-                                       Opd1PropInfo, Opd2PropInfo);
+  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, Opd2Info,
+                                       Opd1PropInfo, Opd2PropInfo, Args, CxtI);
 }

 // Return true if there's a potential benefit from using v2f16 instructions for
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fused_costs.ll b/llvm/test/Analysis/CostModel/AMDGPU/fused_costs.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fused_costs.ll
@@ -0,0 +1,48 @@
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=FUSED,ALL %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefixes=SLOW,ALL %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=FUSED,ALL %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefixes=SLOW,ALL %s
+
+target triple = "amdgcn--"
+
+; ALL-LABEL: 'fmul_fadd_f32':
+; FUSED: estimated cost of 0 for instruction: %mul = fmul float
+; SLOW: estimated cost of 1 for instruction: %mul = fmul float
+; ALL: estimated cost of 1 for instruction: %add = fadd float
+define float @fmul_fadd_f32(float %r0, float %r1, float %r2) #0 {
+  %mul = fmul float %r0, %r1
+  %add = fadd float %mul, %r2
+  ret float %add
+}
+
+; ALL-LABEL: 'fmul_fadd_v2f32':
+; FUSED: estimated cost of 0 for instruction: %mul = fmul <2 x float>
+; SLOW: estimated cost of 2 for instruction: %mul = fmul <2 x float>
+; ALL: estimated cost of 2 for instruction: %add = fadd <2 x float>
+define <2 x float> @fmul_fadd_v2f32(<2 x float> %r0, <2 x float> %r1, <2 x float> %r2) #0 {
+  %mul = fmul <2 x float> %r0, %r1
+  %add = fadd <2 x float> %mul, %r2
+  ret <2 x float> %add
+}
+
+; ALL-LABEL: 'fmul_fsub_f32':
+; FUSED: estimated cost of 0 for instruction: %mul = fmul float
+; SLOW: estimated cost of 1 for instruction: %mul = fmul float
+; ALL: estimated cost of 1 for instruction: %sub = fsub float
+define float @fmul_fsub_f32(float %r0, float %r1, float %r2) #0 {
+  %mul = fmul float %r0, %r1
+  %sub = fsub float %mul, %r2
+  ret float %sub
+}
+
+; ALL-LABEL: 'fmul_fsub_v2f32':
+; FUSED: estimated cost of 0 for instruction: %mul = fmul <2 x float>
+; SLOW: estimated cost of 2 for instruction: %mul = fmul <2 x float>
+; ALL: estimated cost of 2 for instruction: %sub = fsub <2 x float>
+define <2 x float> @fmul_fsub_v2f32(<2 x float> %r0, <2 x float> %r1, <2 x float> %r2) #0 {
+  %mul = fmul <2 x float> %r0, %r1
+  %sub = fsub <2 x float> %mul, %r2
+  ret <2 x float> %sub
+}
+
+attributes #0 = { nounwind }
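Note (not part of the patch): the new Args/CxtI parameters only help if callers supply them. A minimal, hypothetical caller sketch follows, mirroring the argument order of the getArithmeticInstrCost calls plumbed through above and assuming the public TargetTransformInfo wrapper takes the same parameter list; the helper name getFMulCost is illustrative only.

// Hypothetical sketch: query the cost of an fmul while passing its operands
// as Args and the instruction itself as CxtI, so the AMDGPU hook can inspect
// the fmul's single fadd/fsub user and report the fused (free) cost.
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

static int getFMulCost(const TargetTransformInfo &TTI, Instruction *I) {
  // Collect the instruction's operands for the Args parameter.
  SmallVector<const Value *, 4> Operands(I->operand_values());
  return TTI.getArithmeticInstrCost(
      I->getOpcode(), I->getType(), TargetTransformInfo::TCK_RecipThroughput,
      TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OK_AnyValue,
      TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
      Operands, /*CxtI=*/I);
}

With CxtI set, the new ISD::FMUL case can check CxtI->hasOneUse() and return TCC_Free when the single user is an fadd/fsub and f32 denormals are disabled, which is what the FUSED prefixes in the test expect.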