Index: llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp @@ -24,12 +24,6 @@ namespace { -static cl::opt<bool> StressCalls( - "amdgpu-stress-function-calls", - cl::Hidden, - cl::desc("Force all functions to be noinline"), - cl::init(false)); - class AMDGPUAlwaysInline : public ModulePass { bool GlobalOpt; @@ -131,18 +125,13 @@ recursivelyVisitUsers(GV, FuncsToAlwaysInline); } - if (!AMDGPUTargetMachine::EnableFunctionCalls || StressCalls) { - auto IncompatAttr - = StressCalls ? Attribute::AlwaysInline : Attribute::NoInline; + if (!AMDGPUTargetMachine::EnableFunctionCalls) { + auto IncompatAttr = Attribute::NoInline; for (Function &F : M) { if (!F.isDeclaration() && !F.use_empty() && !F.hasFnAttribute(IncompatAttr)) { - if (StressCalls) { - if (!FuncsToAlwaysInline.count(&F)) - FuncsToNoInline.insert(&F); - } else - FuncsToAlwaysInline.insert(&F); + FuncsToAlwaysInline.insert(&F); } } } Index: llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -974,9 +974,12 @@ if (LowerCtorDtor) addPass(createAMDGPUCtorDtorLoweringLegacyPass()); - // Function calls are not supported, so make sure we inline everything. - addPass(createAMDGPUAlwaysInlinePass()); - addPass(createAlwaysInlinerLegacyPass()); + if (!AMDGPUTargetMachine::EnableFunctionCalls || + !AMDGPUTargetMachine::EnableLowerModuleLDS) { + // Function calls are not supported, so make sure we inline everything. + addPass(createAMDGPUAlwaysInlinePass()); + addPass(createAlwaysInlinerLegacyPass()); + } // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
if (TM.getTargetTriple().getArch() == Triple::r600) Index: llvm/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address-codegen.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address-codegen.ll +++ llvm/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address-codegen.ll @@ -1,11 +1,9 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -amdgpu-function-calls -amdgpu-stress-function-calls -amdgpu-enable-lower-module-lds=false < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -amdgpu-stress-function-calls -amdgpu-enable-lower-module-lds=false < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -amdgpu-function-calls=false -amdgpu-enable-lower-module-lds=false < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -amdgpu-enable-lower-module-lds=false < %s | FileCheck -check-prefix=GCN %s @lds0 = addrspace(3) global i32 undef, align 4 ; GCN-NOT: load_lds_simple - define internal i32 @load_lds_simple() { %load = load i32, ptr addrspace(3) @lds0, align 4 ret i32 %load Index: llvm/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address.ll +++ llvm/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address.ll @@ -1,7 +1,7 @@ ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-always-inline -amdgpu-enable-lower-module-lds=false %s | FileCheck --check-prefix=ALL %s ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-always-inline -amdgpu-enable-lower-module-lds=false %s | FileCheck --check-prefix=ALL %s -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-stress-function-calls -passes=amdgpu-always-inline -amdgpu-enable-lower-module-lds=false %s | FileCheck --check-prefix=ALL %s -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-stress-function-calls -passes=amdgpu-always-inline 
-amdgpu-enable-lower-module-lds=false %s | FileCheck --check-prefix=ALL %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-function-calls=0 -passes=amdgpu-always-inline -amdgpu-enable-lower-module-lds=false %s | FileCheck --check-prefix=ALL %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-function-calls=0 -passes=amdgpu-always-inline -amdgpu-enable-lower-module-lds=false %s | FileCheck --check-prefix=ALL %s target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" Index: llvm/test/CodeGen/AMDGPU/llc-pipeline.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -18,9 +18,9 @@ ; GCN-O0-NEXT:Machine Module Information ; GCN-O0-NEXT:Target Transform Information ; GCN-O0-NEXT:Assumption Cache Tracker -; GCN-O0-NEXT:Profile summary info ; GCN-O0-NEXT:Argument Register Usage Information Storage ; GCN-O0-NEXT:Create Garbage Collector Module Metadata +; GCN-O0-NEXT:Profile summary info ; GCN-O0-NEXT:Register Usage Information Storage ; GCN-O0-NEXT:Machine Branch Probability Analysis ; GCN-O0-NEXT: ModulePass Manager @@ -32,12 +32,6 @@ ; GCN-O0-NEXT: FunctionPass Manager ; GCN-O0-NEXT: Dominator Tree Construction ; GCN-O0-NEXT: Lower ctors and dtors for AMDGPU -; GCN-O0-NEXT: AMDGPU Inline All Functions -; GCN-O0-NEXT: Inliner for always_inline functions -; GCN-O0-NEXT: FunctionPass Manager -; GCN-O0-NEXT: Dominator Tree Construction -; GCN-O0-NEXT: Basic Alias Analysis (stateless AA impl) -; GCN-O0-NEXT: Function Alias Analysis Results ; GCN-O0-NEXT: Lower OpenCL enqueued blocks ; GCN-O0-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O0-NEXT: FunctionPass Manager @@ -157,11 +151,11 @@ ; GCN-O1-NEXT:Machine Module Information ; GCN-O1-NEXT:Target Transform Information ; 
GCN-O1-NEXT:Assumption Cache Tracker -; GCN-O1-NEXT:Profile summary info ; GCN-O1-NEXT:AMDGPU Address space based Alias Analysis ; GCN-O1-NEXT:External Alias Analysis ; GCN-O1-NEXT:Type-Based Alias Analysis ; GCN-O1-NEXT:Scoped NoAlias Alias Analysis +; GCN-O1-NEXT:Profile summary info ; GCN-O1-NEXT:Argument Register Usage Information Storage ; GCN-O1-NEXT:Create Garbage Collector Module Metadata ; GCN-O1-NEXT:Machine Branch Probability Analysis @@ -177,12 +171,6 @@ ; GCN-O1-NEXT: FunctionPass Manager ; GCN-O1-NEXT: Dominator Tree Construction ; GCN-O1-NEXT: Lower ctors and dtors for AMDGPU -; GCN-O1-NEXT: AMDGPU Inline All Functions -; GCN-O1-NEXT: Inliner for always_inline functions -; GCN-O1-NEXT: FunctionPass Manager -; GCN-O1-NEXT: Dominator Tree Construction -; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl) -; GCN-O1-NEXT: Function Alias Analysis Results ; GCN-O1-NEXT: Lower OpenCL enqueued blocks ; GCN-O1-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O1-NEXT: AMDGPU Attributor @@ -433,11 +421,11 @@ ; GCN-O1-OPTS-NEXT:Machine Module Information ; GCN-O1-OPTS-NEXT:Target Transform Information ; GCN-O1-OPTS-NEXT:Assumption Cache Tracker -; GCN-O1-OPTS-NEXT:Profile summary info ; GCN-O1-OPTS-NEXT:AMDGPU Address space based Alias Analysis ; GCN-O1-OPTS-NEXT:External Alias Analysis ; GCN-O1-OPTS-NEXT:Type-Based Alias Analysis ; GCN-O1-OPTS-NEXT:Scoped NoAlias Alias Analysis +; GCN-O1-OPTS-NEXT:Profile summary info ; GCN-O1-OPTS-NEXT:Argument Register Usage Information Storage ; GCN-O1-OPTS-NEXT:Create Garbage Collector Module Metadata ; GCN-O1-OPTS-NEXT:Machine Branch Probability Analysis @@ -453,12 +441,6 @@ ; GCN-O1-OPTS-NEXT: FunctionPass Manager ; GCN-O1-OPTS-NEXT: Dominator Tree Construction ; GCN-O1-OPTS-NEXT: Lower ctors and dtors for AMDGPU -; GCN-O1-OPTS-NEXT: AMDGPU Inline All Functions -; GCN-O1-OPTS-NEXT: Inliner for always_inline functions -; GCN-O1-OPTS-NEXT: FunctionPass Manager -; GCN-O1-OPTS-NEXT: Dominator Tree 
Construction -; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl) -; GCN-O1-OPTS-NEXT: Function Alias Analysis Results ; GCN-O1-OPTS-NEXT: Lower OpenCL enqueued blocks ; GCN-O1-OPTS-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O1-OPTS-NEXT: AMDGPU Attributor @@ -731,11 +713,11 @@ ; GCN-O2-NEXT:Machine Module Information ; GCN-O2-NEXT:Target Transform Information ; GCN-O2-NEXT:Assumption Cache Tracker -; GCN-O2-NEXT:Profile summary info ; GCN-O2-NEXT:AMDGPU Address space based Alias Analysis ; GCN-O2-NEXT:External Alias Analysis ; GCN-O2-NEXT:Type-Based Alias Analysis ; GCN-O2-NEXT:Scoped NoAlias Alias Analysis +; GCN-O2-NEXT:Profile summary info ; GCN-O2-NEXT:Argument Register Usage Information Storage ; GCN-O2-NEXT:Create Garbage Collector Module Metadata ; GCN-O2-NEXT:Machine Branch Probability Analysis @@ -751,12 +733,6 @@ ; GCN-O2-NEXT: FunctionPass Manager ; GCN-O2-NEXT: Dominator Tree Construction ; GCN-O2-NEXT: Lower ctors and dtors for AMDGPU -; GCN-O2-NEXT: AMDGPU Inline All Functions -; GCN-O2-NEXT: Inliner for always_inline functions -; GCN-O2-NEXT: FunctionPass Manager -; GCN-O2-NEXT: Dominator Tree Construction -; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl) -; GCN-O2-NEXT: Function Alias Analysis Results ; GCN-O2-NEXT: Lower OpenCL enqueued blocks ; GCN-O2-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O2-NEXT: AMDGPU Attributor @@ -1059,12 +1035,6 @@ ; GCN-O3-NEXT: FunctionPass Manager ; GCN-O3-NEXT: Dominator Tree Construction ; GCN-O3-NEXT: Lower ctors and dtors for AMDGPU -; GCN-O3-NEXT: AMDGPU Inline All Functions -; GCN-O3-NEXT: Inliner for always_inline functions -; GCN-O3-NEXT: FunctionPass Manager -; GCN-O3-NEXT: Dominator Tree Construction -; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl) -; GCN-O3-NEXT: Function Alias Analysis Results ; GCN-O3-NEXT: Lower OpenCL enqueued blocks ; GCN-O3-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O3-NEXT: AMDGPU 
Attributor Index: llvm/test/CodeGen/AMDGPU/smed3.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/smed3.ll +++ llvm/test/CodeGen/AMDGPU/smed3.ll @@ -1,6 +1,7 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: opt -passes=inline -o %t.bc %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %t.bc | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %t.bc | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %t.bc | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s declare i32 @llvm.amdgcn.workitem.id.x() #0 Index: llvm/test/CodeGen/AMDGPU/stress-calls.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/stress-calls.ll +++ /dev/null @@ -1,36 +0,0 @@ -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-stress-function-calls -passes=amdgpu-always-inline %s | FileCheck %s - -; CHECK: define internal fastcc i32 @alwaysinline_func(i32 %a) #0 { -define internal fastcc i32 @alwaysinline_func(i32 %a) alwaysinline { -entry: - %tmp0 = add i32 %a, 1 - ret i32 %tmp0 -} - -; CHECK: define internal fastcc i32 @noinline_func(i32 %a) #1 { -define internal fastcc i32 @noinline_func(i32 %a) noinline { -entry: - %tmp0 = add i32 %a, 2 - ret i32 %tmp0 -} - -; CHECK: define internal fastcc i32 @unmarked_func(i32 %a) #1 { -define internal fastcc i32 @unmarked_func(i32 %a) { -entry: - %tmp0 = add i32 %a, 3 - ret i32 %tmp0 -} - -define amdgpu_kernel void @kernel(ptr addrspace(1) %out) { -entry: - 
%tmp0 = call i32 @alwaysinline_func(i32 1) - store volatile i32 %tmp0, ptr addrspace(1) %out - %tmp1 = call i32 @noinline_func(i32 1) - store volatile i32 %tmp1, ptr addrspace(1) %out - %tmp2 = call i32 @unmarked_func(i32 1) - store volatile i32 %tmp2, ptr addrspace(1) %out - ret void -} - -; CHECK: attributes #0 = { alwaysinline } -; CHECK: attributes #1 = { noinline } Index: llvm/test/CodeGen/AMDGPU/umed3.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/umed3.ll +++ llvm/test/CodeGen/AMDGPU/umed3.ll @@ -1,6 +1,7 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: opt -passes=inline -o %t.bc %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %t.bc | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %t.bc | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %t.bc | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s declare i32 @llvm.amdgcn.workitem.id.x() #0