Index: llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
@@ -263,6 +263,18 @@
   /// individual classes of instructions would be better.
   unsigned getInliningThresholdMultiplier() const;
 
+  /// \returns Vector bonus in percent.
+  ///
+  /// Vector bonuses: We want to more aggressively inline vector-dense kernels
+  /// and apply this bonus based on the percentage of vector instructions. A
+  /// bonus is applied if the vector instructions exceed 50%, and half that
+  /// amount is applied if they exceed 10%. Note that these bonuses are
+  /// somewhat arbitrary and evolved over time by accident as much as because
+  /// they are principled bonuses.
+  /// FIXME: It would be nice to base the bonus values on something more
+  /// scientific. A target may have no bonus on vector instructions.
+  int getInlinerVectorBonusPercent() const;
+
   /// Estimate the cost of an intrinsic when lowered.
   ///
   /// Mirrors the \c getCallCost method but uses an intrinsic identifier.
@@ -1128,6 +1140,7 @@
   virtual int getCallCost(const Function *F,
                           ArrayRef<const Value *> Arguments, const User *U) = 0;
   virtual unsigned getInliningThresholdMultiplier() = 0;
+  virtual int getInlinerVectorBonusPercent() = 0;
   virtual int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
                                ArrayRef<Type *> ParamTys, const User *U) = 0;
   virtual int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
@@ -1351,6 +1364,9 @@
   unsigned getInliningThresholdMultiplier() override {
     return Impl.getInliningThresholdMultiplier();
   }
+  int getInlinerVectorBonusPercent() override {
+    return Impl.getInlinerVectorBonusPercent();
+  }
   int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy, ArrayRef<Type *> ParamTys,
                        const User *U = nullptr) override {
     return Impl.getIntrinsicCost(IID, RetTy, ParamTys, U);
   }
Index: llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -140,6 +140,8 @@
 
   unsigned getInliningThresholdMultiplier() { return 1; }
 
+  int getInlinerVectorBonusPercent() { return 150; }
+
   unsigned getMemcpyCost(const Instruction *I) {
     return TTI::TCC_Expensive;
   }
Index: llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h
@@ -427,6 +427,8 @@
 
   unsigned getInliningThresholdMultiplier() { return 1; }
 
+  int getInlinerVectorBonusPercent() { return 150; }
+
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP) {
     // This unrolling functionality is target independent, but to provide some
Index: llvm/trunk/lib/Analysis/InlineCost.cpp
===================================================================
--- llvm/trunk/lib/Analysis/InlineCost.cpp
+++ llvm/trunk/lib/Analysis/InlineCost.cpp
@@ -880,15 +880,6 @@
   // basic block at the given callsite context. This is speculatively applied
   // and withdrawn if more than one basic block is seen.
   //
-  // Vector bonuses: We want to more aggressively inline vector-dense kernels
-  // and apply this bonus based on the percentage of vector instructions. A
-  // bonus is applied if the vector instructions exceed 50% and half that amount
-  // is applied if it exceeds 10%. Note that these bonuses are some what
-  // arbitrary and evolved over time by accident as much as because they are
-  // principled bonuses.
-  // FIXME: It would be nice to base the bonus values on something more
-  // scientific.
-  //
   // LstCallToStaticBonus: This large bonus is applied to ensure the inlining
   // of the last call to a static function as inlining such functions is
   // guaranteed to reduce code size.
@@ -896,7 +887,7 @@
   // These bonus percentages may be set to 0 based on properties of the caller
   // and the callsite.
   int SingleBBBonusPercent = 50;
-  int VectorBonusPercent = 150;
+  int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
   int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;
 
   // Lambda to set all the above bonus and bonus percentages to 0.
Index: llvm/trunk/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/trunk/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/trunk/lib/Analysis/TargetTransformInfo.cpp
@@ -176,6 +176,10 @@
   return TTIImpl->getInliningThresholdMultiplier();
 }
 
+int TargetTransformInfo::getInlinerVectorBonusPercent() const {
+  return TTIImpl->getInlinerVectorBonusPercent();
+}
+
 int TargetTransformInfo::getGEPCost(Type *PointeeType, const Value *Ptr,
                                     ArrayRef<const Value *> Operands) const {
   return TTIImpl->getGEPCost(PointeeType, Ptr, Operands);
 }
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUInline.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUInline.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUInline.cpp
@@ -39,7 +39,7 @@
 #define DEBUG_TYPE "inline"
 
 static cl::opt<int>
-ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(2200),
+ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(1500),
               cl::desc("Cost of alloca argument"));
 
 // If the amount of scratch memory to eliminate exceeds our ability to allocate
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -191,7 +191,9 @@
   bool areInlineCompatible(const Function *Caller,
                            const Function *Callee) const;
 
-  unsigned getInliningThresholdMultiplier() { return 9; }
+  unsigned getInliningThresholdMultiplier() { return 7; }
+
+  int getInlinerVectorBonusPercent() { return 0; }
 
   int getArithmeticReductionCost(unsigned Opcode,
                                  Type *Ty,
Index: llvm/trunk/test/CodeGen/AMDGPU/amdgpu-inline.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/amdgpu-inline.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/amdgpu-inline.ll
@@ -28,15 +28,8 @@
 define coldcc void @foo_private_ptr2(float addrspace(5)* nocapture %p1, float addrspace(5)* nocapture %p2) {
 entry:
   %tmp1 = load float, float addrspace(5)* %p1, align 4
-  %cmp = fcmp ogt float %tmp1, 1.000000e+00
-  br i1 %cmp, label %if.then, label %if.end
-
-if.then:                                          ; preds = %entry
   %div = fdiv float 2.000000e+00, %tmp1
   store float %div, float addrspace(5)* %p2, align 4
-  br label %if.end
-
-if.end:                                           ; preds = %if.then, %entry
   ret void
 }
 
Index: llvm/trunk/test/Transforms/Inline/AMDGPU/inline-amdgpu-vecbonus.ll
===================================================================
--- llvm/trunk/test/Transforms/Inline/AMDGPU/inline-amdgpu-vecbonus.ll
+++ llvm/trunk/test/Transforms/Inline/AMDGPU/inline-amdgpu-vecbonus.ll
@@ -0,0 +1,31 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -amdgpu-inline --inline-threshold=1 < %s | FileCheck %s
+
+define hidden <16 x i32> @div_vecbonus(<16 x i32> %x, <16 x i32> %y) {
+entry:
+  %div.1 = udiv <16 x i32> %x, %y
+  %div.2 = udiv <16 x i32> %div.1, %y
+  %div.3 = udiv <16 x i32> %div.2, %y
+  %div.4 = udiv <16 x i32> %div.3, %y
+  %div.5 = udiv <16 x i32> %div.4, %y
+  %div.6 = udiv <16 x i32> %div.5, %y
+  %div.7 = udiv <16 x i32> %div.6, %y
+  %div.8 = udiv <16 x i32> %div.7, %y
+  %div.9 = udiv <16 x i32> %div.8, %y
+  %div.10 = udiv <16 x i32> %div.9, %y
+  %div.11 = udiv <16 x i32> %div.10, %y
+  %div.12 = udiv <16 x i32> %div.11, %y
+  ret <16 x i32> %div.12
+}
+
+; CHECK-LABEL: define amdgpu_kernel void @caller_vecbonus
+; CHECK-NOT: udiv
+; CHECK: tail call <16 x i32> @div_vecbonus
+; CHECK: ret void
+define amdgpu_kernel void @caller_vecbonus(<16 x i32> addrspace(1)* nocapture %x, <16 x i32> addrspace(1)* nocapture readonly %y) {
+entry:
+  %tmp = load <16 x i32>, <16 x i32> addrspace(1)* %x
+  %tmp1 = load <16 x i32>, <16 x i32> addrspace(1)* %y
+  %div.i = tail call <16 x i32> @div_vecbonus(<16 x i32> %tmp, <16 x i32> %tmp1)
+  store <16 x i32> %div.i, <16 x i32> addrspace(1)* %x
+  ret void
+}
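
The standalone C++ sketch below models how the percentage returned by the new
hook feeds the inliner's threshold computation. It is a simplified
illustration, not LLVM's actual code: the function names and the exact
formula are hypothetical; only the 10%/50% withdrawal points (from the doc
comment above) and the return values 150 (default) and 0 (AMDGPU) are taken
from the patch itself.

// Minimal sketch (not LLVM code) of the speculative-apply-then-withdraw
// vector bonus scheme described in the doc comment. Helper names and the
// simplified formula are hypothetical.
#include <cstdio>

// Hypothetical stand-in for TTI.getInlinerVectorBonusPercent(): generic
// targets keep the default 150; the AMDGPU override in this patch returns 0.
int getInlinerVectorBonusPercent(bool IsAMDGPU) { return IsAMDGPU ? 0 : 150; }

// Compute an effective inline threshold for a callee with the given
// instruction mix: apply the bonus speculatively, then withdraw it (fully
// or by half) once the vector-instruction density is known.
int effectiveThreshold(int BaseThreshold, int VectorBonusPercent,
                       int NumInstructions, int NumVectorInstructions) {
  int VectorBonus = BaseThreshold * VectorBonusPercent / 100;
  int Threshold = BaseThreshold + VectorBonus; // applied speculatively
  if (NumVectorInstructions <= NumInstructions / 10)
    Threshold -= VectorBonus;     // at most 10% vector: withdraw the bonus
  else if (NumVectorInstructions <= NumInstructions / 2)
    Threshold -= VectorBonus / 2; // at most 50% vector: withdraw half
  return Threshold;               // over 50% vector: keep the full bonus
}

int main() {
  // A callee where 12 of 20 instructions are vector (60% vector-dense).
  std::printf("generic: %d\n",
              effectiveThreshold(225, getInlinerVectorBonusPercent(false),
                                 20, 12)); // 225 + 337 = 562
  std::printf("amdgpu:  %d\n",
              effectiveThreshold(225, getInlinerVectorBonusPercent(true),
                                 20, 12)); // bonus is 0, stays 225
  return 0;
}

With the previous hard-coded "int VectorBonusPercent = 150;" every target
received the same speculative bonus. Routing the value through TTI lets
AMDGPU return 0 (compensating by lowering its threshold multiplier from 9 to
7 and the default amdgpu-inline-arg-alloca-cost from 2200 to 1500), while the
150 defaults in TargetTransformInfoImpl.h and BasicTTIImpl.h leave all other
targets' behavior unchanged.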