diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -289,6 +289,9 @@
   /// individual classes of instructions would be better.
   unsigned getInliningThresholdMultiplier() const;
 
+  /// \returns A value to be added to the inlining threshold.
+  unsigned adjustInliningThreshold(const CallBase *CB) const;
+
   /// \returns Vector bonus in percent.
   ///
   /// Vector bonuses: We want to more aggressively inline vector-dense kernels
@@ -1395,6 +1398,7 @@
                           ArrayRef<const Value *> Operands,
                           TTI::TargetCostKind CostKind) = 0;
   virtual unsigned getInliningThresholdMultiplier() = 0;
+  virtual unsigned adjustInliningThreshold(const CallBase *CB) = 0;
   virtual int getInlinerVectorBonusPercent() = 0;
   virtual int getMemcpyCost(const Instruction *I) = 0;
   virtual unsigned
@@ -1679,6 +1683,9 @@
   unsigned getInliningThresholdMultiplier() override {
     return Impl.getInliningThresholdMultiplier();
   }
+  unsigned adjustInliningThreshold(const CallBase *CB) override {
+    return Impl.adjustInliningThreshold(CB);
+  }
   int getInlinerVectorBonusPercent() override {
     return Impl.getInlinerVectorBonusPercent();
   }
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -67,6 +67,7 @@
   }
 
   unsigned getInliningThresholdMultiplier() const { return 1; }
+  unsigned adjustInliningThreshold(const CallBase *CB) const { return 0; }
 
   int getInlinerVectorBonusPercent() const { return 150; }
 
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -401,6 +401,7 @@
   }
 
   unsigned getInliningThresholdMultiplier() { return 1; }
+  unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }
 
   int getInlinerVectorBonusPercent() { return 150; }
 
diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp
--- a/llvm/lib/Analysis/InlineCost.cpp
+++ b/llvm/lib/Analysis/InlineCost.cpp
@@ -1580,6 +1580,7 @@
   // Finally, take the target-specific inlining threshold multiplier into
   // account.
   Threshold *= TTI.getInliningThresholdMultiplier();
+  Threshold += TTI.adjustInliningThreshold(&Call);
 
   SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
   VectorBonus = Threshold * VectorBonusPercent / 100;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -247,6 +247,11 @@
   return TTIImpl->getInliningThresholdMultiplier();
 }
 
+unsigned
+TargetTransformInfo::adjustInliningThreshold(const CallBase *CB) const {
+  return TTIImpl->adjustInliningThreshold(CB);
+}
+
 int TargetTransformInfo::getInlinerVectorBonusPercent() const {
   return TTIImpl->getInlinerVectorBonusPercent();
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -327,9 +327,6 @@
 void initializeAMDGPUArgumentUsageInfoPass(PassRegistry &);
 
-Pass *createAMDGPUFunctionInliningPass();
-void initializeAMDGPUInlinerPass(PassRegistry&);
-
 ModulePass *createAMDGPUOpenCLEnqueuedBlockLoweringPass();
 void initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(PassRegistry &);
 extern char &AMDGPUOpenCLEnqueuedBlockLoweringID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInline.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInline.cpp
deleted file mode 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInline.cpp
+++ /dev/null
@@ -1,195 +0,0 @@
-//===- AMDGPUInline.cpp - Code to perform simple function inlining --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// This is AMDGPU specific replacement of the standard inliner.
-/// The main purpose is to account for the fact that calls not only expensive
-/// on the AMDGPU, but much more expensive if a private memory pointer is
-/// passed to a function as an argument. In this situation, we are unable to
-/// eliminate private memory in the caller unless inlined and end up with slow
-/// and expensive scratch access. Thus, we boost the inline threshold for such
-/// functions here.
-///
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Transforms/IPO/Inliner.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "inline"
-
-static cl::opt<int>
-ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(4000),
-              cl::desc("Cost of alloca argument"));
-
-// If the amount of scratch memory to eliminate exceeds our ability to allocate
-// it into registers we gain nothing by aggressively inlining functions for that
-// heuristic.
-static cl::opt<unsigned>
-ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256),
-                cl::desc("Maximum alloca size to use for inline cost"));
-
-// Inliner constraint to achieve reasonable compilation time
-static cl::opt<size_t>
-MaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
-      cl::desc("Maximum BB number allowed in a function after inlining"
-               " (compile time constraint)"));
-
-namespace {
-
-class AMDGPUInliner : public LegacyInlinerBase {
-
-public:
-  AMDGPUInliner() : LegacyInlinerBase(ID) {
-    initializeAMDGPUInlinerPass(*PassRegistry::getPassRegistry());
-    Params = getInlineParams();
-  }
-
-  static char ID; // Pass identification, replacement for typeid
-
-  unsigned getInlineThreshold(CallBase &CB) const;
-
-  InlineCost getInlineCost(CallBase &CB) override;
-
-  bool runOnSCC(CallGraphSCC &SCC) override;
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override;
-
-private:
-  TargetTransformInfoWrapperPass *TTIWP;
-
-  InlineParams Params;
-};
-
-} // end anonymous namespace
-
-char AMDGPUInliner::ID = 0;
-INITIALIZE_PASS_BEGIN(AMDGPUInliner, "amdgpu-inline",
-                      "AMDGPU Function Integration/Inlining", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(AMDGPUInliner, "amdgpu-inline",
-                    "AMDGPU Function Integration/Inlining", false, false)
-
-Pass *llvm::createAMDGPUFunctionInliningPass() { return new AMDGPUInliner(); }
-
-bool AMDGPUInliner::runOnSCC(CallGraphSCC &SCC) {
-  TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
-  return LegacyInlinerBase::runOnSCC(SCC);
-}
-
-void AMDGPUInliner::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.addRequired<TargetTransformInfoWrapperPass>();
-  LegacyInlinerBase::getAnalysisUsage(AU);
-}
-
-unsigned AMDGPUInliner::getInlineThreshold(CallBase &CB) const {
-  int Thres = Params.DefaultThreshold;
-
-  Function *Caller = CB.getCaller();
-  // Listen to the inlinehint attribute when it would increase the threshold
-  // and the caller does not need to minimize its size.
-  Function *Callee = CB.getCalledFunction();
-  bool InlineHint = Callee && !Callee->isDeclaration() &&
-                    Callee->hasFnAttribute(Attribute::InlineHint);
-  if (InlineHint && Params.HintThreshold && Params.HintThreshold > Thres
-      && !Caller->hasFnAttribute(Attribute::MinSize))
-    Thres = Params.HintThreshold.getValue() *
-            TTIWP->getTTI(*Callee).getInliningThresholdMultiplier();
-
-  const DataLayout &DL = Caller->getParent()->getDataLayout();
-  if (!Callee)
-    return (unsigned)Thres;
-
-  // If we have a pointer to private array passed into a function
-  // it will not be optimized out, leaving scratch usage.
-  // Increase the inline threshold to allow inliniting in this case.
-  uint64_t AllocaSize = 0;
-  SmallPtrSet<const AllocaInst *, 8> AIVisited;
-  for (Value *PtrArg : CB.args()) {
-    PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
-    if (!Ty || (Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
-                Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS))
-      continue;
-
-    PtrArg = getUnderlyingObject(PtrArg);
-    if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
-      if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
-        continue;
-      AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
-      // If the amount of stack memory is excessive we will not be able
-      // to get rid of the scratch anyway, bail out.
-      if (AllocaSize > ArgAllocaCutoff) {
-        AllocaSize = 0;
-        break;
-      }
-    }
-  }
-  if (AllocaSize)
-    Thres += ArgAllocaCost;
-
-  return (unsigned)Thres;
-}
-
-InlineCost AMDGPUInliner::getInlineCost(CallBase &CB) {
-  Function *Callee = CB.getCalledFunction();
-  Function *Caller = CB.getCaller();
-
-  if (!Callee || Callee->isDeclaration())
-    return llvm::InlineCost::getNever("undefined callee");
-
-  if (CB.isNoInline())
-    return llvm::InlineCost::getNever("noinline");
-
-  TargetTransformInfo &TTI = TTIWP->getTTI(*Callee);
-  if (!TTI.areInlineCompatible(Caller, Callee))
-    return llvm::InlineCost::getNever("incompatible");
-
-  if (CB.hasFnAttr(Attribute::AlwaysInline)) {
-    auto IsViable = isInlineViable(*Callee);
-    if (IsViable.isSuccess())
-      return llvm::InlineCost::getAlways("alwaysinline viable");
-    return llvm::InlineCost::getNever(IsViable.getFailureReason());
-  }
-
-  InlineParams LocalParams = Params;
-  LocalParams.DefaultThreshold = (int)getInlineThreshold(CB);
-  bool RemarksEnabled = false;
-  const auto &BBs = Caller->getBasicBlockList();
-  if (!BBs.empty()) {
-    auto DI = OptimizationRemark(DEBUG_TYPE, "", DebugLoc(), &BBs.front());
-    if (DI.isEnabled())
-      RemarksEnabled = true;
-  }
-
-  OptimizationRemarkEmitter ORE(Caller);
-  auto GetAssumptionCache = [this](Function &F) -> AssumptionCache & {
-    return ACT->getAssumptionCache(F);
-  };
-
-  auto IC = llvm::getInlineCost(CB, Callee, LocalParams, TTI,
-                                GetAssumptionCache, GetTLI, nullptr, PSI,
-                                RemarksEnabled ? &ORE : nullptr);
-
-  if (IC && !IC.isAlways() && !Callee->hasFnAttribute(Attribute::InlineHint)) {
-    // Single BB does not increase total BB amount, thus subtract 1
-    size_t Size = Caller->size() + Callee->size() - 1;
-    if (MaxBB && Size > MaxBB)
-      return llvm::InlineCost::getNever("max number of bb exceeded");
-  }
-  return IC;
-}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -255,7 +255,6 @@
   initializeAMDGPUExternalAAWrapperPass(*PR);
   initializeAMDGPUUseNativeCallsPass(*PR);
   initializeAMDGPUSimplifyLibCallsPass(*PR);
-  initializeAMDGPUInlinerPass(*PR);
   initializeAMDGPUPrintfRuntimeBindingPass(*PR);
   initializeGCNRegBankReassignPass(*PR);
   initializeGCNNSAReassignPass(*PR);
@@ -423,7 +422,7 @@
 
   if (EnableFunctionCalls) {
     delete Builder.Inliner;
-    Builder.Inliner = createAMDGPUFunctionInliningPass();
+    Builder.Inliner = createFunctionInliningPass();
   }
 
   Builder.addExtension(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -203,6 +203,7 @@
                            const Function *Callee) const;
 
   unsigned getInliningThresholdMultiplier() { return 11; }
+  unsigned adjustInliningThreshold(const CallBase *CB) const;
 
   int getInlinerVectorBonusPercent() { return 0; }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -56,6 +56,24 @@
     cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
     cl::init(32), cl::Hidden);
 
+static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
+                                       cl::Hidden, cl::init(4000),
+                                       cl::desc("Cost of alloca argument"));
+
+// If the amount of scratch memory to eliminate exceeds our ability to allocate
+// it into registers we gain nothing by aggressively inlining functions for that
+// heuristic.
+static cl::opt<unsigned>
+    ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
+                    cl::init(256),
+                    cl::desc("Maximum alloca size to use for inline cost"));
+
+// Inliner constraint to achieve reasonable compilation time.
+static cl::opt<size_t> InlineMaxBB(
+    "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
+    cl::desc("Maximum number of BBs allowed in a function after inlining"
+             " (compile time constraint)"));
+
 static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
                               unsigned Depth = 0) {
   const Instruction *I = dyn_cast<Instruction>(Cond);
@@ -1120,7 +1138,47 @@
   // no way to support merge for backend defined attributes.
   AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
   AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
-  return CallerMode.isInlineCompatible(CalleeMode);
+  if (!CallerMode.isInlineCompatible(CalleeMode))
+    return false;
+
+  // Hack to make compile times reasonable.
+  if (InlineMaxBB && !Callee->hasFnAttribute(Attribute::InlineHint)) {
+    // Single BB does not increase total BB amount, thus subtract 1.
+    size_t BBSize = Caller->size() + Callee->size() - 1;
+    return BBSize <= InlineMaxBB;
+  }
+
+  return true;
+}
+
+unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
+  // If we have a pointer to private array passed into a function
+  // it will not be optimized out, leaving scratch usage.
+  // Increase the inline threshold to allow inlining in this case.
+  uint64_t AllocaSize = 0;
+  SmallPtrSet<const AllocaInst *, 8> AIVisited;
+  for (Value *PtrArg : CB->args()) {
+    PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
+    if (!Ty || (Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
+                Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS))
+      continue;
+
+    PtrArg = getUnderlyingObject(PtrArg);
+    if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
+      if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
+        continue;
+      AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
+      // If the amount of stack memory is excessive we will not be able
+      // to get rid of the scratch anyway, bail out.
+      if (AllocaSize > ArgAllocaCutoff) {
+        AllocaSize = 0;
+        break;
+      }
+    }
+  }
+  if (AllocaSize)
+    return ArgAllocaCost;
+  return 0;
 }
 
 void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -86,7 +86,6 @@
   AMDGPUTargetTransformInfo.cpp
   AMDGPUUnifyDivergentExitNodes.cpp
   AMDGPUUnifyMetadata.cpp
-  AMDGPUInline.cpp
   AMDGPUPerfHintAnalysis.cpp
   AMDILCFGStructurizer.cpp
   AMDGPUPrintfRuntimeBinding.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll
@@ -1,5 +1,7 @@
 ; RUN: opt -mtriple=amdgcn--amdhsa -data-layout=A5 -O3 -S -inline-threshold=1 < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-INL1 %s
 ; RUN: opt -mtriple=amdgcn--amdhsa -data-layout=A5 -O3 -S < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-INLDEF %s
+; RUN: opt -mtriple=amdgcn--amdhsa -data-layout=A5 -passes='default<O3>' -S -inline-threshold=1 < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-INL1 %s
+; RUN: opt -mtriple=amdgcn--amdhsa -data-layout=A5 -passes='default<O3>' -S < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-INLDEF %s
 
 define coldcc float @foo(float %x, float %y) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/inline-maxbb.ll b/llvm/test/CodeGen/AMDGPU/inline-maxbb.ll
--- a/llvm/test/CodeGen/AMDGPU/inline-maxbb.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-maxbb.ll
@@ -1,5 +1,7 @@
-; RUN: opt -mtriple=amdgcn-- --amdgpu-inline -S -amdgpu-inline-max-bb=2 %s | FileCheck %s --check-prefix=NOINL
-; RUN: opt -mtriple=amdgcn-- --amdgpu-inline -S -amdgpu-inline-max-bb=3 %s | FileCheck %s --check-prefix=INL
+; RUN: opt -mtriple=amdgcn-- -inline -S -amdgpu-inline-max-bb=2 %s | FileCheck %s --check-prefix=NOINL
+; RUN: opt -mtriple=amdgcn-- -inline -S -amdgpu-inline-max-bb=3 %s | FileCheck %s --check-prefix=INL
+; RUN: opt -mtriple=amdgcn-- -passes=inline -S -amdgpu-inline-max-bb=2 %s | FileCheck %s --check-prefix=NOINL
+; RUN: opt -mtriple=amdgcn-- -passes=inline -S -amdgpu-inline-max-bb=3 %s | FileCheck %s --check-prefix=INL
 
 define i32 @callee(i32 %x) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll b/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll
--- a/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll
@@ -23,7 +23,7 @@
 ; GCN-O0-NEXT: Force set function attributes
 ; GCN-O0-NEXT: CallGraph Construction
 ; GCN-O0-NEXT: Call Graph SCC Pass Manager
-; GCN-O0-NEXT: AMDGPU Function Integration/Inlining
+; GCN-O0-NEXT: Function Integration/Inlining
 ; GCN-O0-NEXT: A No-Op Barrier Pass
@@ -97,7 +97,7 @@
 ; GCN-O1-NEXT: Globals Alias Analysis
 ; GCN-O1-NEXT: Call Graph SCC Pass Manager
 ; GCN-O1-NEXT: Remove unused exception handling info
-; GCN-O1-NEXT: AMDGPU Function Integration/Inlining
+; GCN-O1-NEXT: Function Integration/Inlining
 ; GCN-O1-NEXT: Deduce function attributes
 ; GCN-O1-NEXT: FunctionPass Manager
 ; GCN-O1-NEXT: Infer address spaces
@@ -408,7 +408,7 @@
 ; GCN-O2-NEXT: Globals Alias Analysis
 ; GCN-O2-NEXT: Call Graph SCC Pass Manager
 ; GCN-O2-NEXT: Remove unused exception handling info
-; GCN-O2-NEXT: AMDGPU Function Integration/Inlining
+; GCN-O2-NEXT: Function Integration/Inlining
 ; GCN-O2-NEXT: OpenMP specific optimizations
 ; GCN-O2-NEXT: Deduce function attributes
 ; GCN-O2-NEXT: FunctionPass Manager
@@ -770,7 +770,7 @@
 ; GCN-O3-NEXT: Globals Alias Analysis
 ; GCN-O3-NEXT: Call Graph SCC Pass Manager
 ; GCN-O3-NEXT: Remove unused exception handling info
-; GCN-O3-NEXT: AMDGPU Function Integration/Inlining
+; GCN-O3-NEXT: Function Integration/Inlining
 ; GCN-O3-NEXT: OpenMP specific optimizations
 ; GCN-O3-NEXT: Deduce function attributes
 ; GCN-O3-NEXT: Promote 'by reference' arguments to scalars
diff --git a/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-alloca-argument.ll b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-alloca-argument.ll
--- a/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-alloca-argument.ll
+++ b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-alloca-argument.ll
@@ -1,4 +1,5 @@
-; RUN: opt -mtriple=amdgcn--amdhsa -S -amdgpu-inline -inline-threshold=0 < %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn--amdhsa -S -inline -inline-threshold=0 < %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn--amdhsa -S -passes=inline -inline-threshold=0 < %s | FileCheck %s
 
 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
 
diff --git a/llvm/test/Transforms/Inline/AMDGPU/inline-amdgpu-vecbonus.ll b/llvm/test/Transforms/Inline/AMDGPU/inline-amdgpu-vecbonus.ll
--- a/llvm/test/Transforms/Inline/AMDGPU/inline-amdgpu-vecbonus.ll
+++ b/llvm/test/Transforms/Inline/AMDGPU/inline-amdgpu-vecbonus.ll
@@ -1,4 +1,5 @@
-; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -amdgpu-inline --inline-threshold=1 < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -inline --inline-threshold=1 < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=inline --inline-threshold=1 < %s | FileCheck %s
 
 define hidden <16 x i32> @div_vecbonus(<16 x i32> %x, <16 x i32> %y) {
 entry:
diff --git a/llvm/test/Transforms/Inline/AMDGPU/inline-hint.ll b/llvm/test/Transforms/Inline/AMDGPU/inline-hint.ll
--- a/llvm/test/Transforms/Inline/AMDGPU/inline-hint.ll
+++ b/llvm/test/Transforms/Inline/AMDGPU/inline-hint.ll
@@ -1,4 +1,5 @@
-; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -amdgpu-inline --inline-threshold=1 --inlinehint-threshold=2 < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -inline --inline-threshold=1 --inlinehint-threshold=4 < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=inline --inline-threshold=1 --inlinehint-threshold=4 < %s | FileCheck %s
 
 define hidden <16 x i32> @div_hint(<16 x i32> %x, <16 x i32> %y) #0 {
 entry:
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
--- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
@@ -137,7 +137,6 @@
     "AMDGPUHSAMetadataStreamer.cpp",
     "AMDGPUISelDAGToDAG.cpp",
     "AMDGPUISelLowering.cpp",
-    "AMDGPUInline.cpp",
    "AMDGPUInstCombineIntrinsic.cpp",
    "AMDGPUInstrInfo.cpp",
    "AMDGPUInstructionSelector.cpp",
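
Reviewer note (commentary, not part of the patch): after this change the effective inline threshold is computed as Threshold * getInliningThresholdMultiplier() + adjustInliningThreshold(CB), as seen in the InlineCost.cpp hunk. Below is a minimal, self-contained C++ sketch of that arithmetic using the AMDGPU values wired up here (multiplier 11, flat +4000 bonus when a private/flat alloca pointer is passed). FakeTTI, PassesPrivateAlloca, and the base threshold of 225 are illustrative stand-ins, not LLVM API.

#include <cstdio>

// Stand-in for TargetTransformInfo, returning the AMDGPU values from this
// patch. The real hook inspects the call site; here a bool stands in for
// "a pointer to a private/flat static alloca is passed to the callee".
struct FakeTTI {
  unsigned getInliningThresholdMultiplier() const { return 11; }
  // Flat bonus (amdgpu-inline-arg-alloca-cost, default 4000) applied when a
  // private alloca pointer argument would otherwise force scratch usage.
  unsigned adjustInliningThreshold(bool PassesPrivateAlloca) const {
    return PassesPrivateAlloca ? 4000 : 0;
  }
};

int main() {
  FakeTTI TTI;
  int Threshold = 225; // illustrative base inline threshold
  // Mirrors the updated computation in the InlineCost.cpp hunk:
  //   Threshold *= TTI.getInliningThresholdMultiplier();
  //   Threshold += TTI.adjustInliningThreshold(&Call);
  Threshold *= TTI.getInliningThresholdMultiplier();
  Threshold += TTI.adjustInliningThreshold(/*PassesPrivateAlloca=*/true);
  std::printf("effective threshold: %d\n", Threshold); // 225 * 11 + 4000 = 6475
  return 0;
}

One way to read the design: the old AMDGPUInliner added ArgAllocaCost as a flat amount on top of the threshold, so the replacement had to be a separate additive hook rather than a larger multiplier, which could not express a bonus independent of the base threshold.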