Index: lib/Target/AMDGPU/AMDGPU.h =================================================================== --- lib/Target/AMDGPU/AMDGPU.h +++ lib/Target/AMDGPU/AMDGPU.h @@ -182,6 +182,9 @@ void initializeAMDGPUArgumentUsageInfoPass(PassRegistry &); +Pass *createAMDGPUFunctionInliningPass(); +void initializeAMDGPUInlinerPass(PassRegistry&); + Target &getTheAMDGPUTarget(); Target &getTheGCNTarget(); Index: lib/Target/AMDGPU/AMDGPUInline.cpp =================================================================== --- /dev/null +++ lib/Target/AMDGPU/AMDGPUInline.cpp @@ -0,0 +1,220 @@ +//===- AMDGPUInline.cpp - Code to perform simple function inlining --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief This is AMDGPU specific replacement of the standard inliner. +/// The main purpose is to account for the fact that calls not only expensive +/// on the AMDGPU, but much more expensive if a private memory pointer is +/// passed to a function as an argument. In this situation, we are unable to +/// eliminate private memory in the caller unless inlined and end up with slow +/// and expensive scratch access. Thus, we boost the inline threshold for such +/// functions here. +/// +//===----------------------------------------------------------------------===// + + +#include "AMDGPU.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/Analysis/InlineCost.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/IPO/Inliner.h" + +using namespace llvm; + +#define DEBUG_TYPE "inline" + +static cl::opt +ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(2200), + cl::desc("Cost of alloca argument")); + +// If the amount of scratch memory to eliminate exceeds our ability to allocate +// it into registers we gain nothing by agressively inlining functions for that +// heuristic. +static cl::opt +ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256), + cl::desc("Maximum alloca size to use for inline cost")); + +// The limit of the program size to apply agressive inlining. With a lot of +// inlining we suffer a very slow compilation for huge programs and this +// threshold is to mitigate it. +static cl::opt +MaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(300), + cl::desc("Maximum BB number allowed in a function after inlining")); + +namespace { + +class AMDGPUInliner : public LegacyInlinerBase { + +public: + AMDGPUInliner() : LegacyInlinerBase(ID) { + initializeAMDGPUInlinerPass(*PassRegistry::getPassRegistry()); + Params = getInlineParams(); + } + + static char ID; // Pass identification, replacement for typeid + + unsigned getInlineThreshold(CallSite CS) const; + + InlineCost getInlineCost(CallSite CS) override; + + bool runOnSCC(CallGraphSCC &SCC) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override; + +private: + TargetTransformInfoWrapperPass *TTIWP; + + InlineParams Params; +}; + +} // end anonymous namespace + +char AMDGPUInliner::ID = 0; +INITIALIZE_PASS_BEGIN(AMDGPUInliner, "amdgpu-inline", + "AMDGPU Function Integration/Inlining", false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(AMDGPUInliner, "amdgpu-inline", + "AMDGPU Function Integration/Inlining", false, false) + +Pass *llvm::createAMDGPUFunctionInliningPass() { return new AMDGPUInliner(); } + +bool AMDGPUInliner::runOnSCC(CallGraphSCC &SCC) { + TTIWP = &getAnalysis(); + return LegacyInlinerBase::runOnSCC(SCC); +} + +void AMDGPUInliner::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + LegacyInlinerBase::getAnalysisUsage(AU); +} + +unsigned AMDGPUInliner::getInlineThreshold(CallSite CS) const { + int Thres = Params.DefaultThreshold; + + Function *Caller = CS.getCaller(); + // Listen to the inlinehint attribute when it would increase the threshold + // and the caller does not need to minimize its size. + Function *Callee = CS.getCalledFunction(); + bool InlineHint = Callee && !Callee->isDeclaration() && + Callee->hasFnAttribute(Attribute::InlineHint); + if (InlineHint && Params.HintThreshold && Params.HintThreshold > Thres + && !Caller->hasFnAttribute(Attribute::MinSize)) + Thres = Params.HintThreshold.getValue(); + + const DataLayout &DL = Caller->getParent()->getDataLayout(); + if (!Callee) + return (unsigned)Thres; + + const AMDGPUAS AS = AMDGPU::getAMDGPUAS(*Caller->getParent()); + + // If we have a pointer to private array passed into a function + // it will not be optimized out, leaving scratch usage. + // Increase the inline threshold to allow inliniting in this case. + uint64_t AllocaSize = 0; + SmallPtrSet AIVisited; + for (Value *PtrArg : CS.args()) { + Type *Ty = PtrArg->getType(); + if (!Ty->isPointerTy() || + Ty->getPointerAddressSpace() != AS.PRIVATE_ADDRESS) + continue; + PtrArg = GetUnderlyingObject(PtrArg, DL); + if (const AllocaInst *AI = dyn_cast(PtrArg)) { + if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second) + continue; + AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType()); + // If the amount of stack memory is excessive we will not be able + // to get rid of the scratch anyway, bail out. + if (AllocaSize > ArgAllocaCutoff) { + AllocaSize = 0; + break; + } + } + } + if (AllocaSize) + Thres += ArgAllocaCost; + + return (unsigned)Thres; +} + +// Check if call is just a wrapper around another call. +// In this case we only have call and ret instructions. +static bool isWrapperOnlyCall(CallSite CS) { + Function *Callee = CS.getCalledFunction(); + if (!Callee || Callee->size() != 1) + return false; + const BasicBlock &BB = Callee->getEntryBlock(); + if (const Instruction *I = BB.getFirstNonPHI()) { + if (!isa(I)) { + return false; + } + if (isa(*std::next(I->getIterator()))) { + DEBUG(dbgs() << " Wrapper only call detected: " + << Callee->getName() << '\n'); + return true; + } + } + return false; +} + +InlineCost AMDGPUInliner::getInlineCost(CallSite CS) { + Function *Callee = CS.getCalledFunction(); + Function *Caller = CS.getCaller(); + TargetTransformInfo &TTI = TTIWP->getTTI(*Callee); + + if (!Callee || Callee->isDeclaration() || CS.isNoInline() || + !TTI.areInlineCompatible(Caller, Callee)) + return llvm::InlineCost::getNever(); + + if (CS.hasFnAttr(Attribute::AlwaysInline)) { + if (isInlineViable(*Callee)) + return llvm::InlineCost::getAlways(); + return llvm::InlineCost::getNever(); + } + + if (isWrapperOnlyCall(CS)) + return llvm::InlineCost::getAlways(); + + // Single BB does not increase total BB amount, thus subtract 1 + size_t Size = Caller->size() + Callee->size() - 1; + if (MaxBB && Size > MaxBB) + return InlineCost::getNever(); + + InlineParams LocalParams = Params; + LocalParams.DefaultThreshold = (int)getInlineThreshold(CS); + bool RemarksEnabled = false; + const auto &BBs = Caller->getBasicBlockList(); + if (!BBs.empty()) { + auto DI = OptimizationRemark(DEBUG_TYPE, "", DebugLoc(), &BBs.front()); + if (DI.isEnabled()) + RemarksEnabled = true; + } + + OptimizationRemarkEmitter ORE(Caller); + std::function GetAssumptionCache = + [this](Function &F) -> AssumptionCache & { + return ACT->getAssumptionCache(F); + }; + + return llvm::getInlineCost(CS, Callee, LocalParams, TTI, GetAssumptionCache, + None, PSI, RemarksEnabled ? &ORE : nullptr); +} Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -179,6 +179,7 @@ initializeAMDGPUAAWrapperPassPass(*PR); initializeAMDGPUUseNativeCallsPass(*PR); initializeAMDGPUSimplifyLibCallsPass(*PR); + initializeAMDGPUInlinerPass(*PR); } static std::unique_ptr createTLOF(const Triple &TT) { @@ -325,10 +326,12 @@ bool EnableOpt = getOptLevel() > CodeGenOpt::None; bool Internalize = InternalizeSymbols && EnableOpt && (getTargetTriple().getArch() == Triple::amdgcn); - bool EarlyInline = EarlyInlineAll && EnableOpt; + bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableAMDGPUFunctionCalls; bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt; bool LibCallSimplify = EnableLibCallSimplify && EnableOpt; + Builder.Inliner = createAMDGPUFunctionInliningPass(); + Builder.addExtension( PassManagerBuilder::EP_ModuleOptimizerEarly, [Internalize, EarlyInline, AMDGPUAA](const PassManagerBuilder &, Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -162,6 +162,8 @@ bool areInlineCompatible(const Function *Caller, const Function *Callee) const; + + unsigned getInliningThresholdMultiplier() { return 9; } }; } // end namespace llvm Index: lib/Target/AMDGPU/CMakeLists.txt =================================================================== --- lib/Target/AMDGPU/CMakeLists.txt +++ lib/Target/AMDGPU/CMakeLists.txt @@ -51,6 +51,7 @@ AMDGPUTargetTransformInfo.cpp AMDGPUUnifyDivergentExitNodes.cpp AMDGPUUnifyMetadata.cpp + AMDGPUInline.cpp AMDILCFGStructurizer.cpp GCNHazardRecognizer.cpp GCNIterativeScheduler.cpp Index: test/CodeGen/AMDGPU/amdgpu-inline.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/amdgpu-inline.ll @@ -0,0 +1,152 @@ +; RUN: opt -mtriple=amdgcn--amdhsa -O3 -S -amdgpu-function-calls -inline-threshold=1 < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-INL1 %s +; RUN: opt -mtriple=amdgcn--amdhsa -O3 -S -amdgpu-function-calls < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-INLDEF %s + +define coldcc float @foo(float %x, float %y) { +entry: + %cmp = fcmp ogt float %x, 0.000000e+00 + %div = fdiv float %y, %x + %mul = fmul float %x, %y + %cond = select i1 %cmp, float %div, float %mul + ret float %cond +} + +define coldcc void @foo_private_ptr(float* nocapture %p) { +entry: + %tmp1 = load float, float* %p, align 4 + %cmp = fcmp ogt float %tmp1, 1.000000e+00 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %div = fdiv float 1.000000e+00, %tmp1 + store float %div, float* %p, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +define coldcc void @foo_private_ptr2(float* nocapture %p1, float* nocapture %p2) { +entry: + %tmp1 = load float, float* %p1, align 4 + %cmp = fcmp ogt float %tmp1, 1.000000e+00 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %div = fdiv float 2.000000e+00, %tmp1 + store float %div, float* %p2, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +define coldcc float @sin_wrapper(float %x) { +bb: + %call = tail call float @_Z3sinf(float %x) + ret float %call +} + +define void @foo_noinline(float* nocapture %p) #0 { +entry: + %tmp1 = load float, float* %p, align 4 + %mul = fmul float %tmp1, 2.000000e+00 + store float %mul, float* %p, align 4 + ret void +} + +; GCN: define amdgpu_kernel void @test_inliner( +; GCN-INL1: %c1 = tail call coldcc float @foo( +; GCN-INLDEF: %cmp.i = fcmp ogt float %tmp2, 0.000000e+00 +; GCN: %div.i{{[0-9]*}} = fdiv float 1.000000e+00, %c +; GCN: %div.i{{[0-9]*}} = fdiv float 2.000000e+00, %tmp1.i +; GCN: call void @foo_noinline( +; GCN: tail call float @_Z3sinf( +define amdgpu_kernel void @test_inliner(float addrspace(1)* nocapture %a, i32 %n) { +entry: + %pvt_arr = alloca [64 x float], align 4 + %tid = tail call i32 @llvm.amdgcn.workitem.id.x() + %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i32 %tid + %tmp2 = load float, float addrspace(1)* %arrayidx, align 4 + %add = add i32 %tid, 1 + %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %a, i32 %add + %tmp5 = load float, float addrspace(1)* %arrayidx2, align 4 + %c1 = tail call coldcc float @foo(float %tmp2, float %tmp5) + %or = or i32 %tid, %n + %arrayidx5 = getelementptr inbounds [64 x float], [64 x float]* %pvt_arr, i32 0, i32 %or + store float %c1, float* %arrayidx5, align 4 + %arrayidx7 = getelementptr inbounds [64 x float], [64 x float]* %pvt_arr, i32 0, i32 %or + call coldcc void @foo_private_ptr(float* %arrayidx7) + %arrayidx8 = getelementptr inbounds [64 x float], [64 x float]* %pvt_arr, i32 0, i32 1 + %arrayidx9 = getelementptr inbounds [64 x float], [64 x float]* %pvt_arr, i32 0, i32 2 + call coldcc void @foo_private_ptr2(float* %arrayidx8, float* %arrayidx9) + call void @foo_noinline(float* %arrayidx7) + %and = and i32 %tid, %n + %arrayidx11 = getelementptr inbounds [64 x float], [64 x float]* %pvt_arr, i32 0, i32 %and + %tmp12 = load float, float* %arrayidx11, align 4 + %c2 = call coldcc float @sin_wrapper(float %tmp12) + store float %c2, float* %arrayidx7, align 4 + %xor = xor i32 %tid, %n + %arrayidx16 = getelementptr inbounds [64 x float], [64 x float]* %pvt_arr, i32 0, i32 %xor + %tmp16 = load float, float* %arrayidx16, align 4 + store float %tmp16, float addrspace(1)* %arrayidx, align 4 + ret void +} + +; GCN: define amdgpu_kernel void @test_inliner_multi_pvt_ptr( +; GCN: %div.i{{[0-9]*}} = fdiv float 2.000000e+00, %tmp1.i +define amdgpu_kernel void @test_inliner_multi_pvt_ptr(float addrspace(1)* nocapture %a, i32 %n, float %v) { +entry: + %pvt_arr1 = alloca [32 x float], align 4 + %pvt_arr2 = alloca [32 x float], align 4 + %tid = tail call i32 @llvm.amdgcn.workitem.id.x() + %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i32 %tid + %or = or i32 %tid, %n + %arrayidx4 = getelementptr inbounds [32 x float], [32 x float]* %pvt_arr1, i32 0, i32 %or + %arrayidx5 = getelementptr inbounds [32 x float], [32 x float]* %pvt_arr2, i32 0, i32 %or + store float %v, float* %arrayidx4, align 4 + store float %v, float* %arrayidx5, align 4 + %arrayidx8 = getelementptr inbounds [32 x float], [32 x float]* %pvt_arr1, i32 0, i32 1 + %arrayidx9 = getelementptr inbounds [32 x float], [32 x float]* %pvt_arr2, i32 0, i32 2 + call coldcc void @foo_private_ptr2(float* %arrayidx8, float* %arrayidx9) + %xor = xor i32 %tid, %n + %arrayidx15 = getelementptr inbounds [32 x float], [32 x float]* %pvt_arr1, i32 0, i32 %xor + %arrayidx16 = getelementptr inbounds [32 x float], [32 x float]* %pvt_arr2, i32 0, i32 %xor + %tmp15 = load float, float* %arrayidx15, align 4 + %tmp16 = load float, float* %arrayidx16, align 4 + %tmp17 = fadd float %tmp15, %tmp16 + store float %tmp17, float addrspace(1)* %arrayidx, align 4 + ret void +} + +; GCN: define amdgpu_kernel void @test_inliner_multi_pvt_ptr_cutoff( +; GCN-INL1: call coldcc void @foo_private_ptr2 +; GCN-INLDEF: %div.i{{[0-9]*}} = fdiv float 2.000000e+00, %tmp1.i +define amdgpu_kernel void @test_inliner_multi_pvt_ptr_cutoff(float addrspace(1)* nocapture %a, i32 %n, float %v) { +entry: + %pvt_arr1 = alloca [32 x float], align 4 + %pvt_arr2 = alloca [33 x float], align 4 + %tid = tail call i32 @llvm.amdgcn.workitem.id.x() + %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i32 %tid + %or = or i32 %tid, %n + %arrayidx4 = getelementptr inbounds [32 x float], [32 x float]* %pvt_arr1, i32 0, i32 %or + %arrayidx5 = getelementptr inbounds [33 x float], [33 x float]* %pvt_arr2, i32 0, i32 %or + store float %v, float* %arrayidx4, align 4 + store float %v, float* %arrayidx5, align 4 + %arrayidx8 = getelementptr inbounds [32 x float], [32 x float]* %pvt_arr1, i32 0, i32 1 + %arrayidx9 = getelementptr inbounds [33 x float], [33 x float]* %pvt_arr2, i32 0, i32 2 + call coldcc void @foo_private_ptr2(float* %arrayidx8, float* %arrayidx9) + %xor = xor i32 %tid, %n + %arrayidx15 = getelementptr inbounds [32 x float], [32 x float]* %pvt_arr1, i32 0, i32 %xor + %arrayidx16 = getelementptr inbounds [33 x float], [33 x float]* %pvt_arr2, i32 0, i32 %xor + %tmp15 = load float, float* %arrayidx15, align 4 + %tmp16 = load float, float* %arrayidx16, align 4 + %tmp17 = fadd float %tmp15, %tmp16 + store float %tmp17, float addrspace(1)* %arrayidx, align 4 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 +declare float @_Z3sinf(float) #1 + +attributes #0 = { noinline } +attributes #1 = { nounwind readnone }