diff --git a/clang/lib/Driver/ToolChains/Cuda.h b/clang/lib/Driver/ToolChains/Cuda.h
--- a/clang/lib/Driver/ToolChains/Cuda.h
+++ b/clang/lib/Driver/ToolChains/Cuda.h
@@ -132,8 +132,8 @@
 class LLVM_LIBRARY_VISIBILITY NVPTXToolChain : public ToolChain {
 public:
   NVPTXToolChain(const Driver &D, const llvm::Triple &Triple,
-                 const llvm::Triple &HostTriple,
-                 const llvm::opt::ArgList &Args);
+                 const llvm::Triple &HostTriple, const llvm::opt::ArgList &Args,
+                 bool Freestanding = false);
 
   NVPTXToolChain(const Driver &D, const llvm::Triple &Triple,
                  const llvm::opt::ArgList &Args);
@@ -142,6 +142,11 @@
   TranslateArgs(const llvm::opt::DerivedArgList &Args, StringRef BoundArch,
                 Action::OffloadKind DeviceOffloadKind) const override;
 
+  void
+  addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
+                        llvm::opt::ArgStringList &CC1Args,
+                        Action::OffloadKind DeviceOffloadKind) const override;
+
   // Never try to use the integrated assembler with CUDA; always fork out to
   // ptxas.
   bool useIntegratedAs() const override { return false; }
@@ -168,6 +173,11 @@
 protected:
   Tool *buildAssembler() const override; // ptxas.
   Tool *buildLinker() const override;    // nvlink.
+
+private:
+  // True when this toolchain was created as a standalone NVPTX toolchain (the
+  // constructor without a host triple); enables ctor / dtor lowering.
+  bool Freestanding = false;
 };
 
 class LLVM_LIBRARY_VISIBILITY CudaToolChain : public NVPTXToolChain {
diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -695,8 +695,9 @@
 /// toolchain.
 NVPTXToolChain::NVPTXToolChain(const Driver &D, const llvm::Triple &Triple,
                                const llvm::Triple &HostTriple,
-                               const ArgList &Args)
-    : ToolChain(D, Triple, Args), CudaInstallation(D, HostTriple, Args) {
+                               const ArgList &Args, bool Freestanding)
+    : ToolChain(D, Triple, Args), CudaInstallation(D, HostTriple, Args),
+      Freestanding(Freestanding) {
   if (CudaInstallation.isValid()) {
     CudaInstallation.WarnIfUnsupportedVersion();
     getProgramPaths().push_back(std::string(CudaInstallation.getBinPath()));
@@ -711,7 +712,8 @@
 NVPTXToolChain::NVPTXToolChain(const Driver &D, const llvm::Triple &Triple,
                                const ArgList &Args)
     : NVPTXToolChain(D, Triple,
-                     llvm::Triple(llvm::sys::getDefaultTargetTriple()), Args) {}
+                     llvm::Triple(llvm::sys::getDefaultTargetTriple()), Args,
+                     /*Freestanding=*/true) {}
 
 llvm::opt::DerivedArgList *
 NVPTXToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args,
@@ -735,6 +737,16 @@
   return DAL;
 }
 
+void NVPTXToolChain::addClangTargetOptions(
+    const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args,
+    Action::OffloadKind DeviceOffloadingKind) const {
+  // If we are compiling with a standalone NVPTX toolchain we want to try to
+  // mimic a standard environment as much as possible. So we enable lowering
+  // ctor / dtor functions to global symbols that can be registered.
+  if (Freestanding)
+    CC1Args.append({"-mllvm", "--nvptx-lower-global-ctor-dtor"});
+}
+
 bool NVPTXToolChain::supportsDebugInfoOption(const llvm::opt::Arg *A) const {
   const Option &O = A->getOption();
   return (O.matches(options::OPT_gN_Group) &&
diff --git a/clang/test/Driver/cuda-cross-compiling.c b/clang/test/Driver/cuda-cross-compiling.c
--- a/clang/test/Driver/cuda-cross-compiling.c
+++ b/clang/test/Driver/cuda-cross-compiling.c
@@ -68,3 +68,12 @@
 // DEFAULT: -cc1" "-triple" "nvptx64-nvidia-cuda" "-S" {{.*}} "-target-cpu" "sm_35" "-target-feature" "+ptx{{[0-9]+}}" {{.*}} "-o" "[[PTX:.+]].s"
 // DEFAULT-NEXT: ptxas{{.*}}"-m64" "-O0" "--gpu-name" "sm_35" "--output-file" "[[CUBIN:.+]].cubin" "[[PTX]].s" "-c"
 // DEFAULT-NEXT: nvlink{{.*}}"-o" "a.out" "-arch" "sm_35" {{.*}} "[[CUBIN]].cubin"
+
+//
+// Test to ensure that we enable handling global constructors in a freestanding
+// Nvidia compilation.
+//
+// RUN: %clang -target nvptx64-nvidia-cuda -march=sm_70 %s -### 2>&1 \
+// RUN:   | FileCheck -check-prefix=LOWERING %s
+
+// LOWERING: -cc1" "-triple" "nvptx64-nvidia-cuda" {{.*}} "-mllvm" "--nvptx-lower-global-ctor-dtor"
diff --git a/llvm/lib/Target/NVPTX/CMakeLists.txt b/llvm/lib/Target/NVPTX/CMakeLists.txt
--- a/llvm/lib/Target/NVPTX/CMakeLists.txt
+++ b/llvm/lib/Target/NVPTX/CMakeLists.txt
@@ -37,6 +37,7 @@
   NVVMIntrRange.cpp
   NVVMReflect.cpp
   NVPTXProxyRegErasure.cpp
+  NVPTXCtorDtorLowering.cpp
   )
 
 add_llvm_target(NVPTXCodeGen
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -39,6 +39,7 @@
                                  llvm::CodeGenOpt::Level OptLevel);
 ModulePass *createNVPTXAssignValidGlobalNamesPass();
 ModulePass *createGenericToNVVMLegacyPass();
+ModulePass *createNVPTXCtorDtorLoweringLegacyPass();
 FunctionPass *createNVVMIntrRangePass(unsigned int SmVersion);
 FunctionPass *createNVVMReflectPass(unsigned int SmVersion);
 MachineFunctionPass *createNVPTXPrologEpilogPass();
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -92,6 +92,11 @@
 
 using namespace llvm;
 
+static cl::opt<bool>
+    LowerCtorDtor("nvptx-lower-global-ctor-dtor",
+                  cl::desc("Lower GPU ctor / dtors to globals on the device."),
+                  cl::init(false), cl::Hidden);
+
 #define DEPOTNAME "__local_depot"
 
 /// DiscoverDependentGlobals - Return a set of GlobalVariables on which \p V
@@ -788,12 +793,14 @@
     report_fatal_error("Module has aliases, which NVPTX does not support.");
     return true; // error
   }
-  if (!isEmptyXXStructor(M.getNamedGlobal("llvm.global_ctors"))) {
+  if (!isEmptyXXStructor(M.getNamedGlobal("llvm.global_ctors")) &&
+      !LowerCtorDtor) {
     report_fatal_error(
         "Module has a nontrivial global ctor, which NVPTX does not support.");
     return true; // error
   }
-  if (!isEmptyXXStructor(M.getNamedGlobal("llvm.global_dtors"))) {
+  if (!isEmptyXXStructor(M.getNamedGlobal("llvm.global_dtors")) &&
+      !LowerCtorDtor) {
     report_fatal_error(
         "Module has a nontrivial global dtor, which NVPTX does not support.");
     return true; // error
diff --git a/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.h b/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.h
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.h
@@ -0,0 +1,31 @@
+//===-- NVPTXCtorDtorLowering.h --------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXCTORDTORLOWERING_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXCTORDTORLOWERING_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+class Module;
+class PassRegistry;
+
+extern char &NVPTXCtorDtorLoweringLegacyPassID;
+extern void initializeNVPTXCtorDtorLoweringLegacyPass(PassRegistry &);
+
+/// Replace llvm.global_ctors and llvm.global_dtors with one named global per
+/// entry so that a runtime can discover the constructors and destructors.
+class NVPTXCtorDtorLoweringPass
+    : public PassInfoMixin<NVPTXCtorDtorLoweringPass> {
+public:
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_NVPTX_NVPTXCTORDTORLOWERING_H
diff --git a/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp
@@ -0,0 +1,125 @@
+//===-- NVPTXCtorDtorLowering.cpp - Handle global ctors and dtors --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This pass replaces llvm.global_ctors and llvm.global_dtors with one named,
+/// externally visible global per entry so a runtime can find them by name.
+///
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXCtorDtorLowering.h"
+#include "NVPTX.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/MD5.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "nvptx-lower-ctor-dtor"
+
+static cl::opt<std::string>
+    GlobalStr("nvptx-lower-global-ctor-dtor-id",
+              cl::desc("Override unique ID of ctor/dtor globals."),
+              cl::init(""), cl::Hidden);
+
+namespace {
+
+/// Return the low 64 bits of the MD5 digest of \p Str as lowercase hex.
+static std::string getHash(StringRef Str) {
+  llvm::MD5 Hasher;
+  llvm::MD5::MD5Result Hash;
+  Hasher.update(Str);
+  Hasher.final(Hash);
+  return llvm::utohexstr(Hash.low(), /*LowerCase=*/true);
+}
+
+/// Emit one exported constant global per entry of the array named
+/// \p GlobalName and erase the array. Returns true if the module changed.
+static bool createInitOrFiniGlobals(Module &M, StringRef GlobalName,
+                                    bool IsCtor) {
+  GlobalVariable *GV = M.getGlobalVariable(GlobalName);
+  if (!GV || !GV->hasInitializer())
+    return false;
+  ConstantArray *GA = dyn_cast<ConstantArray>(GV->getInitializer());
+  if (!GA || GA->getNumOperands() == 0)
+    return false;
+
+  // We append a semi-unique hash and the priority to the global name. The ID
+  // is the same for every entry, so compute it once.
+  std::string GlobalID =
+      !GlobalStr.empty() ? GlobalStr : getHash(M.getSourceFileName());
+
+  // NVPTX has no way to emit variables at specific sections or support for
+  // the traditional constructor sections. Instead, we emit mangled global
+  // names so the runtime can build the list manually.
+  for (Value *V : GA->operands()) {
+    auto *CS = cast<ConstantStruct>(V);
+    auto *F = cast<Constant>(CS->getOperand(1));
+    uint64_t Priority = cast<ConstantInt>(CS->getOperand(0))->getSExtValue();
+    std::string PriorityStr = "." + std::to_string(Priority);
+    std::string NameStr =
+        ((IsCtor ? "__init_array_object_" : "__fini_array_object_") +
+         F->getName() + "_" + GlobalID + "_" + std::to_string(Priority))
+            .str();
+    // PTX does not support exported names with '.' in them.
+    llvm::transform(NameStr, NameStr.begin(),
+                    [](char c) { return c == '.' ? '_' : c; });
+
+    auto *Entry = new GlobalVariable(M, F->getType(), /*IsConstant=*/true,
+                                     GlobalValue::ExternalLinkage, F, NameStr,
+                                     nullptr, GlobalValue::NotThreadLocal,
+                                     /*AddressSpace=*/4);
+    // This isn't respected by Nvidia, simply put here for clarity.
+    Entry->setSection(IsCtor ? ".init_array" + PriorityStr
+                             : ".fini_array" + PriorityStr);
+    Entry->setVisibility(GlobalVariable::ProtectedVisibility);
+    appendToUsed(M, {Entry});
+  }
+
+  GV->eraseFromParent();
+  return true;
+}
+
+static bool lowerCtorsAndDtors(Module &M) {
+  bool Modified = false;
+  Modified |= createInitOrFiniGlobals(M, "llvm.global_ctors", /*IsCtor=*/true);
+  Modified |= createInitOrFiniGlobals(M, "llvm.global_dtors", /*IsCtor=*/false);
+  return Modified;
+}
+
+class NVPTXCtorDtorLoweringLegacy final : public ModulePass {
+public:
+  static char ID;
+  NVPTXCtorDtorLoweringLegacy() : ModulePass(ID) {}
+  bool runOnModule(Module &M) override { return lowerCtorsAndDtors(M); }
+};
+
+} // End anonymous namespace
+
+PreservedAnalyses NVPTXCtorDtorLoweringPass::run(Module &M,
+                                                 ModuleAnalysisManager &AM) {
+  return lowerCtorsAndDtors(M) ? PreservedAnalyses::none()
+                               : PreservedAnalyses::all();
+}
+
+char NVPTXCtorDtorLoweringLegacy::ID = 0;
+char &llvm::NVPTXCtorDtorLoweringLegacyPassID = NVPTXCtorDtorLoweringLegacy::ID;
+INITIALIZE_PASS(NVPTXCtorDtorLoweringLegacy, DEBUG_TYPE,
+                "Lower ctors and dtors for NVPTX", false, false)
+
+ModulePass *llvm::createNVPTXCtorDtorLoweringLegacyPass() {
+  return new NVPTXCtorDtorLoweringLegacy();
+}
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -15,6 +15,7 @@
 #include "NVPTXAliasAnalysis.h"
 #include "NVPTXAllocaHoisting.h"
 #include "NVPTXAtomicLower.h"
+#include "NVPTXCtorDtorLowering.h"
 #include "NVPTXLowerAggrCopies.h"
 #include "NVPTXMachineFunctionInfo.h"
 #include "NVPTXTargetObjectFile.h"
@@ -68,8 +69,9 @@
 void initializeNVPTXAllocaHoistingPass(PassRegistry &);
 void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&);
 void initializeNVPTXAtomicLowerPass(PassRegistry &);
+void initializeNVPTXCtorDtorLoweringLegacyPass(PassRegistry &);
 void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
 void initializeNVPTXLowerAllocaPass(PassRegistry &);
 void initializeNVPTXLowerArgsPass(PassRegistry &);
 void initializeNVPTXProxyRegErasurePass(PassRegistry &);
 void initializeNVVMIntrRangePass(PassRegistry &);
@@ -95,6 +97,7 @@
   initializeNVPTXAtomicLowerPass(PR);
   initializeNVPTXLowerArgsPass(PR);
   initializeNVPTXLowerAllocaPass(PR);
+  initializeNVPTXCtorDtorLoweringLegacyPass(PR);
   initializeNVPTXLowerAggrCopiesPass(PR);
   initializeNVPTXProxyRegErasurePass(PR);
   initializeNVPTXDAGToDAGISelPass(PR);
@@ -249,6 +252,10 @@
   PB.registerPipelineParsingCallback(
       [](StringRef PassName, ModulePassManager &PM,
          ArrayRef<PassBuilder::PipelineElement>) {
+        if (PassName == "nvptx-lower-ctor-dtor") {
+          PM.addPass(NVPTXCtorDtorLoweringPass());
+          return true;
+        }
         if (PassName == "generic-to-nvvm") {
           PM.addPass(GenericToNVVMPass());
           return true;
@@ -369,6 +376,7 @@
   }
 
   addPass(createAtomicExpandPass());
+  addPass(createNVPTXCtorDtorLoweringLegacyPass());
 
   // === LSR and other generic IR passes ===
   TargetPassConfig::addIRPasses();
diff --git a/llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll b/llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll
@@ -0,0 +1,34 @@
+; RUN: opt -S -mtriple=nvptx64-- -nvptx-lower-ctor-dtor < %s | FileCheck %s
+; RUN: opt -S -mtriple=nvptx64-- -passes=nvptx-lower-ctor-dtor < %s | FileCheck %s
+; RUN: opt -S -mtriple=nvptx64-- -passes=nvptx-lower-ctor-dtor \
+; RUN:   -nvptx-lower-global-ctor-dtor-id=unique_id < %s | FileCheck %s --check-prefix=GLOBAL
+
+; Make sure we get the same result if we run multiple times
+; RUN: opt -S -mtriple=nvptx64-- -passes=nvptx-lower-ctor-dtor,nvptx-lower-ctor-dtor < %s | FileCheck %s
+
+; Make sure the lowered globals are emitted as visible constants in the PTX.
+; RUN: llc -nvptx-lower-global-ctor-dtor -mtriple=nvptx64-nvidia-cuda -mcpu=sm_70 -filetype=asm -o - < %s | FileCheck %s -check-prefix=VISIBILITY
+
+@llvm.global_ctors = appending addrspace(1) global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @foo, ptr null }]
+@llvm.global_dtors = appending addrspace(1) global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @bar, ptr null }]
+
+; CHECK-NOT: @llvm.global_ctors
+; CHECK-NOT: @llvm.global_dtors
+
+; CHECK: @__init_array_object_foo_[[HASH:[0-9a-f]+]]_1 = protected addrspace(4) constant ptr @foo, section ".init_array.1"
+; CHECK: @__fini_array_object_bar_[[HASH:[0-9a-f]+]]_1 = protected addrspace(4) constant ptr @bar, section ".fini_array.1"
+; CHECK: @llvm.used = appending global [2 x ptr] [ptr addrspacecast (ptr addrspace(4) @__init_array_object_foo_[[HASH]]_1 to ptr), ptr addrspacecast (ptr addrspace(4) @__fini_array_object_bar_[[HASH]]_1 to ptr)], section "llvm.metadata"
+; GLOBAL: @__init_array_object_foo_unique_id_1 = protected addrspace(4) constant ptr @foo, section ".init_array.1"
+; GLOBAL: @__fini_array_object_bar_unique_id_1 = protected addrspace(4) constant ptr @bar, section ".fini_array.1"
+; GLOBAL: @llvm.used = appending global [2 x ptr] [ptr addrspacecast (ptr addrspace(4) @__init_array_object_foo_unique_id_1 to ptr), ptr addrspacecast (ptr addrspace(4) @__fini_array_object_bar_unique_id_1 to ptr)], section "llvm.metadata"
+
+; VISIBILITY: .visible .const .align 8 .u64 __init_array_object_foo_[[HASH:[0-9a-f]+]]_1 = foo;
+; VISIBILITY: .visible .const .align 8 .u64 __fini_array_object_bar_[[HASH:[0-9a-f]+]]_1 = bar;
+
+define internal void @foo() {
+  ret void
+}
+
+define internal void @bar() {
+  ret void
+}