Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -84,6 +84,15 @@
   cl::init(false),
   cl::Hidden);
 
+// Option to run internalize pass. When it is set, adjustPassManager schedules
+// an internalize pass that keeps only entry-point functions, declarations and
+// globals that still have uses, followed by a global DCE pass.
+static cl::opt<bool> InternalizeSymbols(
+  "amdgpu-internalize-symbols",
+  cl::desc("Enable elimination of non-kernel functions and unused globals"),
+  cl::init(false),
+  cl::Hidden);
+
 extern "C" void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -207,11 +216,41 @@
 }
 
 void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
+  // Internalization is opt-in, skipped at -O0 and limited to amdgcn triples.
+  bool Internalize = InternalizeSymbols &&
+                     (getOptLevel() > CodeGenOpt::None) &&
+                     (getTargetTriple().getArch() == Triple::amdgcn);
   Builder.addExtension(
     PassManagerBuilder::EP_ModuleOptimizerEarly,
-    [&](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
+    // Capture by value: the extension is stored in Builder and may run after
+    [Internalize](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
       PM.add(createAMDGPUUnifyMetadataPass());
-  });
+      if (Internalize) {
+        // The callback returns true for every global that must stay external.
+        PM.add(createInternalizePass([](const GlobalValue &GV) -> bool {
+          if (const Function *F = dyn_cast<Function>(&GV)) {
+            if (F->isDeclaration())
+              return true;
+            // Among defined functions only shader/kernel entry points are kept.
+            switch (F->getCallingConv()) {
+            default:
+              return false;
+            case CallingConv::AMDGPU_VS:
+            case CallingConv::AMDGPU_GS:
+            case CallingConv::AMDGPU_PS:
+            case CallingConv::AMDGPU_CS:
+            case CallingConv::AMDGPU_KERNEL:
+            case CallingConv::SPIR_KERNEL:
+              return true;
+            }
+          }
+          // Other globals are preserved only while they still have uses.
+          return !GV.use_empty();
+        }));
+        // Remove everything that internalization left unreferenced.
+        PM.add(createGlobalDCEPass());
+      }
+  });
 }
 
 //===----------------------------------------------------------------------===//
Index: llvm/trunk/test/CodeGen/AMDGPU/internalize.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/internalize.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/internalize.ll
@@ -0,0 +1,40 @@
+; RUN: opt -O1 -S -mtriple=amdgcn-unknown-amdhsa -amdgpu-internalize-symbols < %s | FileCheck %s
+; The unused symbols and the alwaysinline helper must be removed; the
+; referenced global and the kernel must survive.
+; CHECK-NOT: unused
+; CHECK-NOT: foo_used
+; CHECK: gvar_used
+; A NOT directive only covers output up to the next positive match, so repeat.
+; CHECK-NOT: unused
+; CHECK-NOT: foo_used
+; CHECK: main_kernel
+
+@gvar_unused = addrspace(1) global i32 undef, align 4
+@gvar_used = addrspace(1) global i32 undef, align 4
+
+; Function Attrs: alwaysinline nounwind
+define void @foo_unused(i32 addrspace(1)* %out) local_unnamed_addr #1 {
+entry:
+  store i32 1, i32 addrspace(1)* %out
+  ret void
+}
+
+; Function Attrs: alwaysinline nounwind
+define void @foo_used(i32 addrspace(1)* %out, i32 %tid) local_unnamed_addr #1 {
+entry:
+  store i32 %tid, i32 addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @main_kernel() {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  tail call void @foo_used(i32 addrspace(1)* @gvar_used, i32 %tid) nounwind
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
+
+attributes #1 = { alwaysinline nounwind }