diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp @@ -116,14 +116,21 @@ // OpenCL doesn't allow declaring LDS in non-kernels, so in practice this // should only appear when IPO passes manages to move LDs defined in a kernel // into a single user function. - - for (GlobalVariable &GV : M.globals()) { - // TODO: Region address - unsigned AS = GV.getAddressSpace(); - if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS) - continue; - - recursivelyVisitUsers(GV, FuncsToAlwaysInline); + // + // Since now, LDS uses within non-kernel functions are being handled in the + // pass - `LowerModuleLDS`, we *NO* need to *forcefully* inline non-kernel + // functions just because they use LDS. Do forceful inlining only when the + // pass - `LowerModuleLDS` is not enabled. It is enabled by default. + + if (!AMDGPUTargetMachine::EnableLowerModuleLDS) { + for (GlobalVariable &GV : M.globals()) { + // TODO: Region address + unsigned AS = GV.getAddressSpace(); + if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS) + continue; + + recursivelyVisitUsers(GV, FuncsToAlwaysInline); + } } if (!AMDGPUTargetMachine::EnableFunctionCalls || StressCalls) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -35,6 +35,7 @@ static bool EnableLateStructurizeCFG; static bool EnableFunctionCalls; static bool EnableFixedFunctionABI; + static bool EnableLowerModuleLDS; AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, TargetOptions Options, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -193,10 +193,10 @@ cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true), cl::Hidden); -static cl::opt - DisableLowerModuleLDS("amdgpu-disable-lower-module-lds", cl::Hidden, - cl::desc("Disable lower module lds pass"), - cl::init(false)); +static cl::opt EnableLowerModuleLDS( + "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"), + cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true), + cl::Hidden); extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { // Register the target @@ -399,6 +399,7 @@ bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false; bool AMDGPUTargetMachine::EnableFunctionCalls = false; bool AMDGPUTargetMachine::EnableFixedFunctionABI = false; +bool AMDGPUTargetMachine::EnableLowerModuleLDS = false; AMDGPUTargetMachine::~AMDGPUTargetMachine() = default; @@ -894,7 +895,7 @@ addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass()); // Can increase LDS used by kernel so runs before PromoteAlloca - if (!DisableLowerModuleLDS) + if (EnableLowerModuleLDS) addPass(createAMDGPULowerModuleLDSPass()); if (TM.getOptLevel() > CodeGenOpt::None) { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-disable-lower-module-lds=true -o - %s 2> %t | FileCheck --check-prefix=GFX8 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-enable-lower-module-lds=false -o - %s 2> %t | FileCheck --check-prefix=GFX8 %s ; RUN: FileCheck -check-prefix=ERR %s < %t -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-disable-lower-module-lds=true -o - %s 2> %t | FileCheck --check-prefix=GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-enable-lower-module-lds=false -o - %s 2> %t | FileCheck --check-prefix=GFX9 %s ; RUN: FileCheck -check-prefix=ERR %s < %t @lds = internal addrspace(3) global float undef, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll --- a/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll @@ -1,4 +1,4 @@ -; RUN: not --crash llc -march=amdgcn -verify-machineinstrs -amdgpu-disable-lower-module-lds=true < %s 2>&1 | FileCheck -check-prefix=ERROR %s +; RUN: not --crash llc -march=amdgcn -verify-machineinstrs -amdgpu-enable-lower-module-lds=false < %s 2>&1 | FileCheck -check-prefix=ERROR %s ; ERROR: LLVM ERROR: Unsupported expression in static initializer: addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*) diff --git a/llvm/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address-codegen.ll b/llvm/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address-codegen.ll --- a/llvm/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address-codegen.ll +++ b/llvm/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address-codegen.ll @@ -1,6 +1,6 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -amdgpu-function-calls -amdgpu-stress-function-calls < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -amdgpu-stress-function-calls < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -amdgpu-function-calls -amdgpu-stress-function-calls -amdgpu-enable-lower-module-lds=false < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -amdgpu-stress-function-calls -amdgpu-enable-lower-module-lds=false < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -amdgpu-enable-lower-module-lds=false < %s | FileCheck -check-prefix=GCN %s @lds0 = addrspace(3) global i32 undef, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address.ll b/llvm/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address.ll --- a/llvm/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address.ll +++ b/llvm/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address.ll @@ -1,7 +1,7 @@ -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-always-inline %s | FileCheck --check-prefix=ALL %s -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-always-inline %s | FileCheck --check-prefix=ALL %s -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-stress-function-calls -amdgpu-always-inline %s | FileCheck --check-prefix=ALL %s -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-stress-function-calls -passes=amdgpu-always-inline %s | FileCheck --check-prefix=ALL %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-always-inline -amdgpu-enable-lower-module-lds=false %s | FileCheck --check-prefix=ALL %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-always-inline -amdgpu-enable-lower-module-lds=false %s | FileCheck --check-prefix=ALL %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-stress-function-calls -amdgpu-always-inline -amdgpu-enable-lower-module-lds=false %s | FileCheck --check-prefix=ALL %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-stress-function-calls -passes=amdgpu-always-inline -amdgpu-enable-lower-module-lds=false %s | FileCheck --check-prefix=ALL %s target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" diff --git a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll --- a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -o - -amdgpu-disable-lower-module-lds=true %s 2> %t | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -o - -amdgpu-enable-lower-module-lds=false %s 2> %t | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: FileCheck -check-prefix=ERR %s < %t -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - -amdgpu-disable-lower-module-lds=true %s 2> %t | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - -amdgpu-enable-lower-module-lds=false %s 2> %t | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: FileCheck -check-prefix=ERR %s < %t @lds = internal addrspace(3) global float undef, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll @@ -1,5 +1,5 @@ ; RUN: opt -S -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-promote-alloca < %s | FileCheck -check-prefix=IR %s -; RUN: llc -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-disable-lower-module-lds=true < %s | FileCheck -check-prefix=ASM %s +; RUN: llc -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-enable-lower-module-lds=false < %s | FileCheck -check-prefix=ASM %s target datalayout = "A5"