Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -319,16 +319,34 @@
   });
 }
 
+/// Predicate for Internalize pass.
+bool mustPreserveGV(const GlobalValue &GV) {
+  if (const Function *F = dyn_cast<Function>(&GV))
+    return F->isDeclaration() || AMDGPU::isEntryFunctionCC(F->getCallingConv());
+
+  return !GV.use_empty();
+}
+
 void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
   Builder.DivergentTarget = true;
 
   bool EnableOpt = getOptLevel() > CodeGenOpt::None;
-  bool Internalize = InternalizeSymbols && EnableOpt &&
-                     (getTargetTriple().getArch() == Triple::amdgcn);
+  bool Internalize = InternalizeSymbols;
   bool EarlyInline = EarlyInlineAll && EnableOpt;
   bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
   bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;
 
+  if (Internalize) {
+    // If we're generating code, we always have the whole program available. The
+    // relocations expected for externally visible functions aren't supported,
+    // so make sure every non-entry function is hidden.
+    Builder.addExtension(
+        PassManagerBuilder::EP_EnabledOnOptLevel0,
+        [](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
+          PM.add(createInternalizePass(mustPreserveGV));
+        });
+  }
+
   Builder.addExtension(
     PassManagerBuilder::EP_ModuleOptimizerEarly,
     [Internalize, EarlyInline, AMDGPUAA](const PassManagerBuilder &,
@@ -339,25 +357,7 @@
       }
       PM.add(createAMDGPUUnifyMetadataPass());
       if (Internalize) {
-        PM.add(createInternalizePass([=](const GlobalValue &GV) -> bool {
-          if (const Function *F = dyn_cast<Function>(&GV)) {
-            if (F->isDeclaration())
-              return true;
-            switch (F->getCallingConv()) {
-            default:
-              return false;
-            case CallingConv::AMDGPU_VS:
-            case CallingConv::AMDGPU_HS:
-            case CallingConv::AMDGPU_GS:
-            case CallingConv::AMDGPU_PS:
-            case CallingConv::AMDGPU_CS:
-            case CallingConv::AMDGPU_KERNEL:
-            case CallingConv::SPIR_KERNEL:
-              return true;
-            }
-          }
-          return !GV.use_empty();
-        }));
+        PM.add(createInternalizePass(mustPreserveGV));
         PM.add(createGlobalDCEPass());
       }
       if (EarlyInline)
Index: test/CodeGen/AMDGPU/internalize.ll
===================================================================
--- test/CodeGen/AMDGPU/internalize.ll
+++ test/CodeGen/AMDGPU/internalize.ll
@@ -1,35 +1,68 @@
-; RUN: opt -O1 -S -mtriple=amdgcn-unknown-amdhsa -amdgpu-internalize-symbols < %s | FileCheck %s
-; CHECK-NOT: unused
-; CHECK-NOT: foo_used
-; CHECK: gvar_used
-; CHECK: main_kernel
+; RUN: opt -O1 -S -mtriple=amdgcn-unknown-amdhsa -amdgpu-internalize-symbols < %s | FileCheck -check-prefix=ALL -check-prefix=OPT %s
+; RUN: opt -O0 -S -mtriple=amdgcn-unknown-amdhsa -amdgpu-internalize-symbols < %s | FileCheck -check-prefix=ALL -check-prefix=OPTNONE %s
 
+; OPT-NOT: gvar_unused
+; OPTNONE: gvar_unused
 @gvar_unused = addrspace(1) global i32 undef, align 4
+
+; ALL: gvar_used
 @gvar_used = addrspace(1) global i32 undef, align 4
 
-; Function Attrs: alwaysinline nounwind
-define amdgpu_kernel void @foo_unused(i32 addrspace(1)* %out) local_unnamed_addr #1 {
+; ALL: define internal fastcc void @func_used(
+define fastcc void @func_used(i32 addrspace(1)* %out, i32 %tid) #1 {
+entry:
+  store volatile i32 %tid, i32 addrspace(1)* %out
+  ret void
+}
+
+; ALL: define internal fastcc void @func_used_noinline(
+define fastcc void @func_used_noinline(i32 addrspace(1)* %out, i32 %tid) #2 {
+entry:
+  store volatile i32 %tid, i32 addrspace(1)* %out
+  ret void
+}
+
+; OPTNONE: define internal fastcc void @func_used_alwaysinline(
+; OPT-NOT: @func_used_alwaysinline
+define fastcc void @func_used_alwaysinline(i32 addrspace(1)* %out, i32 %tid) #3 {
 entry:
-  store i32 1, i32 addrspace(1)* %out
+  store volatile i32 %tid, i32 addrspace(1)* %out
   ret void
 }
 
-; Function Attrs: alwaysinline nounwind
-define amdgpu_kernel void @foo_used(i32 addrspace(1)* %out, i32 %tid) local_unnamed_addr #1 {
+; OPTNONE: define internal void @func_unused(
+; OPT-NOT: @func_unused
+define void @func_unused(i32 addrspace(1)* %out, i32 %tid) #2 {
 entry:
-  store i32 %tid, i32 addrspace(1)* %out
+  store volatile i32 %tid, i32 addrspace(1)* %out
   ret void
 }
 
+; ALL: define amdgpu_kernel void @kernel_unused(
+define amdgpu_kernel void @kernel_unused(i32 addrspace(1)* %out) #1 {
+entry:
+  store volatile i32 1, i32 addrspace(1)* %out
+  ret void
+}
+
+; ALL: define amdgpu_kernel void @main_kernel()
+; ALL: tail call i32 @llvm.amdgcn.workitem.id.x
+; ALL: tail call fastcc void @func_used
+; ALL: tail call fastcc void @func_used_noinline
+; ALL: store volatile
+; ALL: ret void
 define amdgpu_kernel void @main_kernel() {
 entry:
-  %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  tail call void @foo_used(i32 addrspace(1)* @gvar_used, i32 %tid) nounwind
+  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
+  tail call fastcc void @func_used(i32 addrspace(1)* @gvar_used, i32 %tid)
+  tail call fastcc void @func_used_noinline(i32 addrspace(1)* @gvar_used, i32 %tid)
+  tail call fastcc void @func_used_alwaysinline(i32 addrspace(1)* @gvar_used, i32 %tid)
   ret void
 }
 
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 
 attributes #0 = { nounwind readnone }
-
-attributes #1 = { alwaysinline nounwind }
+attributes #1 = { nounwind }
+attributes #2 = { noinline nounwind }
+attributes #3 = { alwaysinline nounwind }