Index: include/clang/Driver/Action.h =================================================================== --- include/clang/Driver/Action.h +++ include/clang/Driver/Action.h @@ -71,9 +71,10 @@ VerifyPCHJobClass, OffloadBundlingJobClass, OffloadUnbundlingJobClass, + PartialLinkerJobClass, JobClassFirst = PreprocessJobClass, - JobClassLast = OffloadUnbundlingJobClass + JobClassLast = PartialLinkerJobClass }; // The offloading kind determines if this action is binded to a particular @@ -589,6 +590,18 @@ } }; +class PartialLinkerJobAction : public JobAction { + void anchor() override; + +public: + // Partial linking does not change the type of output. + PartialLinkerJobAction(ActionList &Inputs); + + static bool classof(const Action *A) { + return A->getKind() == PartialLinkerJobClass; + } +}; + } // namespace driver } // namespace clang Index: include/clang/Driver/Compilation.h =================================================================== --- include/clang/Driver/Compilation.h +++ include/clang/Driver/Compilation.h @@ -122,6 +122,9 @@ /// Whether an error during the parsing of the input args. bool ContainsError; + /// Whether the clang-offload-bundler can be skipped. + bool SkipOffloadBundler = false; + public: Compilation(const Driver &D, const ToolChain &DefaultToolChain, llvm::opt::InputArgList *Args, @@ -301,6 +304,16 @@ /// of three. The inferior process's stdin(0), stdout(1), and stderr(2) will /// be redirected to the corresponding paths, if provided (not llvm::None). void Redirect(ArrayRef> Redirects); + + /// Set whether the compilation can avoid calling the clang-offload-bundler + /// for object file types. + /// + /// \param skipBundler - bool value set once by the driver. + void setSkipOffloadBundler(bool skipBundler); + + /// Returns true when calls to the clang-offload-bundler are not required + /// for object types. 
+ bool canSkipOffloadBundler() const; }; } // namespace driver Index: include/clang/Driver/Driver.h =================================================================== --- include/clang/Driver/Driver.h +++ include/clang/Driver/Driver.h @@ -256,7 +256,7 @@ llvm::opt::DerivedArgList * TranslateInputArgs(const llvm::opt::InputArgList &Args) const; - // getFinalPhase - Determine which compilation mode we are in and record + // getFinalPhase - Determine which compilation mode we are in and record // which option we used to determine the final phase. phases::ID getFinalPhase(const llvm::opt::DerivedArgList &DAL, llvm::opt::Arg **FinalPhaseArg = nullptr) const; @@ -363,12 +363,12 @@ llvm::opt::InputArgList ParseArgStrings(ArrayRef Args, bool &ContainsError); - /// BuildInputs - Construct the list of inputs and their types from + /// BuildInputs - Construct the list of inputs and their types from /// the given arguments. /// /// \param TC - The default host tool chain. /// \param Args - The input arguments. - /// \param Inputs - The list to store the resulting compilation + /// \param Inputs - The list to store the resulting compilation /// inputs onto. void BuildInputs(const ToolChain &TC, llvm::opt::DerivedArgList &Args, InputList &Inputs) const; @@ -491,7 +491,7 @@ /// \param JA - The action of interest. /// \param BaseInput - The original input file that this action was /// triggered by. - /// \param BoundArch - The bound architecture. + /// \param BoundArch - The bound architecture. /// \param AtTopLevel - Whether this is a "top-level" action. /// \param MultipleArchs - Whether multiple -arch options were supplied. /// \param NormalizedTriple - The normalized triple of the relevant target. 
@@ -500,7 +500,7 @@ bool AtTopLevel, bool MultipleArchs, StringRef NormalizedTriple) const; - /// GetTemporaryPath - Return the pathname of a temporary file to use + /// GetTemporaryPath - Return the pathname of a temporary file to use /// as part of compilation; the file will have the given prefix and suffix. /// /// GCC goes to extra lengths here to be a bit more robust. Index: include/clang/Driver/Options.td =================================================================== --- include/clang/Driver/Options.td +++ include/clang/Driver/Options.td @@ -1486,6 +1486,10 @@ Flags<[CC1Option, NoArgumentUnused, HelpHidden]>; def fno_openmp_cuda_mode : Flag<["-"], "fno-openmp-cuda-mode">, Group, Flags<[NoArgumentUnused, HelpHidden]>; +def fopenmp_use_target_bundling : Flag<["-"], "fopenmp-use-target-bundling">, Group, + Flags<[CC1Option, NoArgumentUnused, HelpHidden]>; +def fno_openmp_use_target_bundling : Flag<["-"], "fno-openmp-use-target-bundling">, Group, + Flags<[NoArgumentUnused, HelpHidden]>; def fno_optimize_sibling_calls : Flag<["-"], "fno-optimize-sibling-calls">, Group; def foptimize_sibling_calls : Flag<["-"], "foptimize-sibling-calls">, Group; def fno_escaping_block_tail_calls : Flag<["-"], "fno-escaping-block-tail-calls">, Group, Flags<[CC1Option]>; Index: include/clang/Driver/ToolChain.h =================================================================== --- include/clang/Driver/ToolChain.h +++ include/clang/Driver/ToolChain.h @@ -126,12 +126,14 @@ mutable std::unique_ptr Assemble; mutable std::unique_ptr Link; mutable std::unique_ptr OffloadBundler; + mutable std::unique_ptr PartialLinker; Tool *getClang() const; Tool *getAssemble() const; Tool *getLink() const; Tool *getClangAs() const; Tool *getOffloadBundler() const; + Tool *getPartialLinker() const; mutable std::unique_ptr SanitizerArguments; mutable std::unique_ptr XRayArguments; Index: lib/Driver/Action.cpp =================================================================== --- lib/Driver/Action.cpp 
+++ lib/Driver/Action.cpp @@ -40,6 +40,8 @@ return "clang-offload-bundler"; case OffloadUnbundlingJobClass: return "clang-offload-unbundler"; + case PartialLinkerJobClass: + return "partial-linker"; } llvm_unreachable("invalid class"); @@ -388,3 +390,8 @@ OffloadUnbundlingJobAction::OffloadUnbundlingJobAction(Action *Input) : JobAction(OffloadUnbundlingJobClass, Input, Input->getType()) {} + +void PartialLinkerJobAction::anchor() {} + +PartialLinkerJobAction::PartialLinkerJobAction(ActionList &Inputs) + : JobAction(PartialLinkerJobClass, Inputs, Inputs.front()->getType()) {} Index: lib/Driver/Compilation.cpp =================================================================== --- lib/Driver/Compilation.cpp +++ lib/Driver/Compilation.cpp @@ -271,3 +271,11 @@ void Compilation::Redirect(ArrayRef> Redirects) { this->Redirects = Redirects; } + +void Compilation::setSkipOffloadBundler(bool skipBundler) { + SkipOffloadBundler = skipBundler; +} + +bool Compilation::canSkipOffloadBundler() const { + return SkipOffloadBundler; +} Index: lib/Driver/Driver.cpp =================================================================== --- lib/Driver/Driver.cpp +++ lib/Driver/Driver.cpp @@ -2683,7 +2683,8 @@ /// results will be kept in this action builder. Return true if an error was /// found. bool addHostDependenceToDeviceActions(Action *&HostAction, - const Arg *InputArg) { + const Arg *InputArg, + bool SkipBundler) { if (!IsValid) return true; @@ -2695,7 +2696,8 @@ // the input is not a bundle. if (CanUseBundler && isa(HostAction) && InputArg->getOption().getKind() == llvm::opt::Option::InputClass && - !types::isSrcFile(HostAction->getType())) { + !types::isSrcFile(HostAction->getType()) && + !SkipBundler) { auto UnbundlingHostAction = C.MakeAction(HostAction); UnbundlingHostAction->registerDependentActionInfo( @@ -2732,7 +2734,7 @@ /// function can replace the host action by a bundling action if the /// programming models allow it. 
bool appendTopLevelActions(ActionList &AL, Action *HostAction, - const Arg *InputArg) { + const Arg *InputArg, bool usePartialLinkStep) { // Get the device actions to be appended. ActionList OffloadAL; for (auto *SB : SpecializedBuilders) { @@ -2750,7 +2752,10 @@ // We expect that the host action was just appended to the action list // before this method was called. assert(HostAction == AL.back() && "Host action not in the list??"); - HostAction = C.MakeAction<OffloadBundlingJobAction>(OffloadAL); + if (usePartialLinkStep) + HostAction = C.MakeAction<PartialLinkerJobAction>(OffloadAL); + else + HostAction = C.MakeAction<OffloadBundlingJobAction>(OffloadAL); AL.back() = HostAction; } else AL.append(OffloadAL.begin(), OffloadAL.end()); @@ -2913,6 +2918,52 @@ YcArg = YuArg = nullptr; } + // Determine whether the bundler tool can be skipped based on the set + // of triples provided to the -fopenmp-targets flag, if it is present. + // -fopenmp-use-target-bundling forces the bundler; hasFlag lets a later + // -fno-openmp-use-target-bundling cancel it. + bool CanSkipClangOffloadBundler = false; + if (!Args.hasFlag(options::OPT_fopenmp_use_target_bundling, + options::OPT_fno_openmp_use_target_bundling, false)) { + if (Arg *OpenMPTargets = C.getInputArgs().getLastArg( + options::OPT_fopenmp_targets_EQ)) { + if (OpenMPTargets->getValues().size() > 0) { + unsigned triplesRequiringBundler = 0; + for (const char *Val : OpenMPTargets->getValues()) { + llvm::Triple TT(Val); + + // Any triple that is not NVPTX, including an invalid one whose + // arch parses as UnknownArch, still needs the bundler. + if (!TT.isNVPTX()) { + triplesRequiringBundler++; + } + } + CanSkipClangOffloadBundler = (triplesRequiringBundler == 0); + C.setSkipOffloadBundler(CanSkipClangOffloadBundler); + } + } + } + + // Determine whether a linker which supports partial linking + // exists. On linux systems ld provides this functionality, there + // may be other linkers that work also. + // TODO: test if linker supports partial linking i.e. -r + // We know ld does so we will actually check if the linker + // is ld instead but this needs to be replaced. 
+ bool CanDoPartialLinking = false; + if (CanSkipClangOffloadBundler && + C.getInputArgs().hasArg(options::OPT_c)) { + // The bundler can be replaced with a partial linking step + // only when outputting an object. For all other cases the + // fallback solution is the clang-offload-bundler. + // Hold an owning copy; a StringRef would dangle if this returns by value. + std::string LinkerName = C.getDefaultToolChain().GetLinkerPath(); + // TODO: test if linker supports partial linking i.e. -r + // We know ld does so we will actually check if the linker + // is ld instead but this needs to be replaced. + CanDoPartialLinking = StringRef(LinkerName).endswith("/ld"); + } + // Builder to be used to build offloading actions. OffloadingActionBuilder OffloadBuilder(C, Args, Inputs); @@ -2988,7 +3039,13 @@ // Use the current host action in any of the offloading actions, if // required. - if (OffloadBuilder.addHostDependenceToDeviceActions(Current, InputArg)) + // The action may contain a bundling step which should not be executed + // if the toolchain we are targeting can produce object files that + // are understood by the host linker. + bool SkipBundler = (InputType == types::TY_Object) && + CanSkipClangOffloadBundler; + if (OffloadBuilder.addHostDependenceToDeviceActions( + Current, InputArg, SkipBundler)) break; for (SmallVectorImpl::iterator i = PL.begin(), e = PL.end(); @@ -3024,7 +3081,8 @@ // Use the current host action in any of the offloading actions, if // required. - if (OffloadBuilder.addHostDependenceToDeviceActions(Current, InputArg)) + if (OffloadBuilder.addHostDependenceToDeviceActions( + Current, InputArg, SkipBundler)) break; if (Current->getType() == types::TY_Nothing) @@ -3036,7 +3094,8 @@ Actions.push_back(Current); // Add any top level actions generated for offloading. - OffloadBuilder.appendTopLevelActions(Actions, Current, InputArg); + OffloadBuilder.appendTopLevelActions(Actions, Current, InputArg, + CanDoPartialLinking); } // Add a link action if necessary. 
@@ -3586,6 +3645,7 @@ InputInfoList OffloadDependencesInputInfo; bool BuildingForOffloadDevice = TargetDeviceOffloadKind != Action::OFK_None; + if (const OffloadAction *OA = dyn_cast(A)) { // The 'Darwin' toolchain is initialized only when its arguments are // computed. Get the default arguments for OFK_None to ensure that Index: lib/Driver/ToolChain.cpp =================================================================== --- lib/Driver/ToolChain.cpp +++ lib/Driver/ToolChain.cpp @@ -272,6 +272,12 @@ return OffloadBundler.get(); } +Tool *ToolChain::getPartialLinker() const { + if (!PartialLinker) + PartialLinker.reset(new tools::PartialLinker(*this)); + return PartialLinker.get(); +} + Tool *ToolChain::getTool(Action::ActionClass AC) const { switch (AC) { case Action::AssembleJobClass: @@ -300,6 +306,10 @@ case Action::OffloadBundlingJobClass: case Action::OffloadUnbundlingJobClass: return getOffloadBundler(); + + case Action::PartialLinkerJobClass: + return getPartialLinker(); + } llvm_unreachable("Invalid tool kind."); @@ -553,7 +563,7 @@ StringRef Suffix = tools::arm::getLLVMArchSuffixForARM(CPU, MArch, Triple); bool IsMProfile = ARM::parseArchProfile(Suffix) == ARM::ProfileKind::M; - bool ThumbDefault = IsMProfile || (ARM::parseArchVersion(Suffix) == 7 && + bool ThumbDefault = IsMProfile || (ARM::parseArchVersion(Suffix) == 7 && getTriple().isOSBinFormatMachO()); // FIXME: this is invalid for WindowsCE if (getTriple().isOSWindows()) Index: lib/Driver/ToolChains/Clang.h =================================================================== --- lib/Driver/ToolChains/Clang.h +++ lib/Driver/ToolChains/Clang.h @@ -147,6 +147,19 @@ const llvm::opt::ArgList &TCArgs, const char *LinkingOutput) const override; }; + +/// Partial linker tool. 
+class LLVM_LIBRARY_VISIBILITY PartialLinker final : public Tool { +public: + PartialLinker(const ToolChain &TC) + : Tool("PartialLinker", "partial-linker", TC) {} + + bool hasIntegratedCPP() const override { return false; } + void ConstructJob(Compilation &C, const JobAction &JA, + const InputInfo &Output, const InputInfoList &Inputs, + const llvm::opt::ArgList &TCArgs, + const char *LinkingOutput) const override; +}; } // end namespace tools } // end namespace driver Index: lib/Driver/ToolChains/Clang.cpp =================================================================== --- lib/Driver/ToolChains/Clang.cpp +++ lib/Driver/ToolChains/Clang.cpp @@ -5478,6 +5478,49 @@ C.addCommand(llvm::make_unique(JA, *this, Exec, CmdArgs, Inputs)); } +// Begin partial linking + +void PartialLinker::ConstructJob(Compilation &C, const JobAction &JA, + const InputInfo &Output, + const InputInfoList &Inputs, + const llvm::opt::ArgList &TCArgs, + const char *LinkingOutput) const { + // This tool is only expected to be invoked for partial linking jobs. + assert(isa<PartialLinkerJobAction>(JA) && "Expecting partial linking job!"); + + // The partial linking command line (using ld as example): + // ld -r input1.o input2.o -o single-file.o + ArgStringList CmdArgs; + + // Ensure conditions are met for doing partial linking instead of bundling. + assert(TCArgs.hasArg(options::OPT_c) && + "Can only use partial linking for object file generation."); + assert(C.canSkipOffloadBundler() && + "Offload bundler cannot be skipped."); + + // TODO: drop once the Driver checks for -r support. The path is tested + // in-expression: no StringRef outlives it, nothing unused under NDEBUG. + assert(StringRef(getToolChain().GetLinkerPath()).endswith("/ld") && + "Partial linking not supported."); + + // Enable partial linking. + CmdArgs.push_back(TCArgs.MakeArgString("-r")); + + // Add input files. + for (unsigned I = 0; I < Inputs.size(); ++I) { + CmdArgs.push_back(TCArgs.MakeArgString(Inputs[I].getFilename())); + } + + // Add output file. 
+ CmdArgs.push_back(TCArgs.MakeArgString("-o")); + CmdArgs.push_back(TCArgs.MakeArgString(Output.getFilename())); + + // Add partial linker command. + C.addCommand(llvm::make_unique( + JA, *this, TCArgs.MakeArgString(getToolChain().GetLinkerPath()), + CmdArgs, None)); +} + // Begin OffloadBundler void OffloadBundler::ConstructJob(Compilation &C, const JobAction &JA, Index: lib/Driver/ToolChains/Cuda.cpp =================================================================== --- lib/Driver/ToolChains/Cuda.cpp +++ lib/Driver/ToolChains/Cuda.cpp @@ -382,7 +382,8 @@ CmdArgs.push_back("--gpu-name"); CmdArgs.push_back(Args.MakeArgString(CudaArchToString(gpu_arch))); CmdArgs.push_back("--output-file"); - CmdArgs.push_back(Args.MakeArgString(TC.getInputFilename(Output))); + const char *CubinF = Args.MakeArgString(TC.getInputFilename(Output)); + CmdArgs.push_back(CubinF); for (const auto& II : Inputs) CmdArgs.push_back(Args.MakeArgString(II.getFilename())); @@ -408,6 +409,130 @@ else Exec = Args.MakeArgString(TC.GetProgramPath("ptxas")); C.addCommand(llvm::make_unique(JA, *this, Exec, CmdArgs, Inputs)); + + // For OpenMP targets offloaded to an NVIDIA device offloading, call the + // NVIDIA tools that make the object file discoverable by NVLINK. + // Wrap the resulting fatbinary file into a host-friendly object file to + // be linked with the host object file. + if (JA.isDeviceOffloading(Action::OFK_OpenMP) && + Args.hasArg(options::OPT_c) && + C.canSkipOffloadBundler()) { + ArgStringList FatbinaryCmdArgs; + FatbinaryCmdArgs.push_back(TC.getTriple().isArch64Bit() ? "-64" : "-32"); + + ArgStringList CompilerCmdArgs; + CompilerCmdArgs.push_back(Args.MakeArgString("-c")); + CompilerCmdArgs.push_back(Args.MakeArgString("-o")); + CompilerCmdArgs.push_back(Args.MakeArgString(Output.getFilename())); + CompilerCmdArgs.push_back(Args.MakeArgString(llvm::Twine("-I") + + TC.CudaInstallation.getBinPath() + llvm::Twine("/../include"))); + + // Create fatbin file using fatbinary executable. 
+ SmallString<128> OrigOutputFileName = + llvm::sys::path::filename(Output.getFilename()); + + // Create fatbin file. + const char *FatbinF; + if (C.getDriver().isSaveTempsEnabled()) { + llvm::sys::path::replace_extension(OrigOutputFileName, "fatbin"); + FatbinF = C.getArgs().MakeArgString(OrigOutputFileName.c_str()); + } else { + llvm::sys::path::replace_extension(OrigOutputFileName, ""); + OrigOutputFileName = + C.getDriver().GetTemporaryPath(OrigOutputFileName, "fatbin"); + FatbinF = + C.addTempFile(C.getArgs().MakeArgString(OrigOutputFileName.c_str())); + } + FatbinaryCmdArgs.push_back( + Args.MakeArgString(llvm::Twine("--create=") + FatbinF)); + + // Create fatbin file wrapper using fatbinary executable. + const char *WrappedFatbinF; + llvm::sys::path::replace_extension(OrigOutputFileName, "fatbin.c"); + if (C.getDriver().isSaveTempsEnabled()) + WrappedFatbinF = C.getArgs().MakeArgString(OrigOutputFileName); + else + WrappedFatbinF = + C.addTempFile(C.getArgs().MakeArgString(OrigOutputFileName)); + FatbinaryCmdArgs.push_back( + Args.MakeArgString(llvm::Twine("--embedded-fatbin=") + + WrappedFatbinF)); + + // Continue assembling the host compiler arguments. + CompilerCmdArgs.push_back(Args.MakeArgString(WrappedFatbinF)); + + StringRef GPUArch = Args.getLastArgValue(options::OPT_march_EQ); + assert(!GPUArch.empty() && "At least one GPU Arch required for nvlink."); + + for (const auto& II : Inputs) { + SmallString<128> OrigInputFileName = + llvm::sys::path::filename(II.getFilename()); + + if (II.getType() == types::TY_LLVM_IR || + II.getType() == types::TY_LTO_IR || + II.getType() == types::TY_LTO_BC || + II.getType() == types::TY_LLVM_BC) { + C.getDriver().Diag(diag::err_drv_no_linker_llvm_support) + << getToolChain().getTripleString(); + continue; + } + + // Currently, we only pass the input files to the linker, we do not pass + // any libraries that may be valid only for the host. Any static + // libraries will be handled at the link stage. 
+ if (!II.isFilename() || OrigInputFileName.endswith(".a")) + continue; + + auto *A = II.getAction(); + assert(A->getInputs().size() == 1 && + "Device offload action is expected to have a single input"); + CudaArch gpu_arch = StringToCudaArch(GPUArch); + + // We need to pass an Arch of the form "sm_XX" for cubin files and + // "compute_XX" for ptx. MakeArgString keeps the sm_XX copy alive. + const char *Arch = + (II.getType() == types::TY_PP_Asm) + ? CudaVirtualArchToString(VirtualArchForCudaArch(gpu_arch)) + : Args.MakeArgString(GPUArch); + const char *PtxF = + C.addTempFile(C.getArgs().MakeArgString(II.getFilename())); + FatbinaryCmdArgs.push_back("--cmdline=--compile-only"); + FatbinaryCmdArgs.push_back( + Args.MakeArgString(llvm::Twine("--image=profile=") + + Arch + ",file=" + PtxF)); + FatbinaryCmdArgs.push_back( + Args.MakeArgString(llvm::Twine("--image=profile=") + + GPUArch + "@" + Arch + ",file=" + CubinF)); + } + + FatbinaryCmdArgs.push_back(Args.MakeArgString("--cuda")); + FatbinaryCmdArgs.push_back(Args.MakeArgString("--device-c")); + + for (const auto& A : Args.getAllArgValues(options::OPT_Xcuda_fatbinary)) + FatbinaryCmdArgs.push_back(Args.MakeArgString(A)); + + // fatbinary --create=ompprint.fatbin -64 + // --image=profile=compute_35,file=ompprint.compute_35.ptx + // --image=profile=sm_35@compute_35,file=ompprint.compute_35.sm_35.cubin + // --embedded-fatbin=ompprint.fatbin.c --cuda --device-c + const char *Exec = Args.MakeArgString(TC.GetProgramPath("fatbinary")); + C.addCommand(llvm::make_unique( + JA, *this, Exec, FatbinaryCmdArgs, Inputs)); + + // Come up with a unique name for the fatbin segment. The name uses + // the hash of the full path of the file. 
+ std::hash hash_fn; + size_t hash = hash_fn(llvm::sys::path::filename(Output.getFilename())); + CompilerCmdArgs.push_back( + Args.MakeArgString(llvm::Twine("-D__NV_MODULE_ID=") + + llvm::Twine(hash))); + + // clang++ -c ompprint.fatbin.c -I/path/to/cuda/include/dir + const char *CompilerExec = + Args.MakeArgString(TC.GetProgramPath("clang++")); + C.addCommand(llvm::make_unique( + JA, *this, CompilerExec, CompilerCmdArgs, Inputs)); + } } static bool shouldIncludePTX(const ArgList &Args, const char *gpu_arch) { @@ -512,6 +637,9 @@ // Add paths specified in LIBRARY_PATH environment variable as -L options. addDirectoryList(Args, CmdArgs, "-L", "LIBRARY_PATH"); + if (C.canSkipOffloadBundler()) + Args.AddAllArgs(CmdArgs, options::OPT_L); + // Add paths for the default clang library path. SmallString<256> DefaultLibPath = llvm::sys::path::parent_path(TC.getDriver().Dir); @@ -531,15 +659,37 @@ continue; } - // Currently, we only pass the input files to the linker, we do not pass - // any libraries that may be valid only for the host. - if (!II.isFilename()) + if (!II.isFilename()) { + // Anything that's not a file name is potentially a static library + // so treat it as such. + if (C.canSkipOffloadBundler()) + CmdArgs.push_back(C.getArgs().MakeArgString(llvm::Twine("-l") + + II.getInputArg().getValue())); continue; + } - const char *CubinF = C.addTempFile( - C.getArgs().MakeArgString(getToolChain().getInputFilename(II))); - - CmdArgs.push_back(CubinF); + StringRef OrigInputFileName = + llvm::sys::path::filename(II.getBaseInput()); + if (OrigInputFileName.endswith(".a")) { + const char *StaticLibName = + C.getArgs().MakeArgString(II.getFilename()); + CmdArgs.push_back(StaticLibName); + } else { + // If the original input is not an object file then it means the + // assembly step has actually produced a cubin so we need to + // rename it accordingly. 
+ if ((!C.canSkipOffloadBundler() && OrigInputFileName.endswith(".o")) || + (C.canSkipOffloadBundler() && !OrigInputFileName.endswith(".o"))) { + // Create cubin file name and add it as a temporary file. + SmallString<256> Filename(II.getFilename()); + llvm::sys::path::replace_extension(Filename, "cubin"); + const char *CubinF = C.addTempFile( + C.getArgs().MakeArgString(Filename.str())); + CmdArgs.push_back(CubinF); + } else { + CmdArgs.push_back(II.getFilename()); + } + } } AddOpenMPLinkerScript(getToolChain(), C, Output, Inputs, Args, CmdArgs, JA); Index: test/Driver/openmp-offload-gpu-linux.c =================================================================== --- /dev/null +++ test/Driver/openmp-offload-gpu-linux.c @@ -0,0 +1,52 @@ +/// +/// Perform driver tests for OpenMP offloading on Linux systems +/// + +// UNSUPPORTED: system-windows + +// REQUIRES: clang-driver +// REQUIRES: x86-registered-target +// REQUIRES: powerpc-registered-target +// REQUIRES: nvptx-registered-target + +/// Check cubin file generation and partial linking with ld +// RUN: %clang -### -target powerpc64le-unknown-linux-gnu -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ +// RUN: -no-canonical-prefixes -save-temps %s -c 2>&1 \ +// RUN: | FileCheck -check-prefix=CHK-PTXAS-CUBIN-BUNDLING %s + +// CHK-PTXAS-CUBIN-BUNDLING: clang{{.*}}" "-o" "[[PTX:.*\.s]]" +// CHK-PTXAS-CUBIN-BUNDLING-NEXT: ptxas{{.*}}" "--output-file" "[[CUBIN:.*\.cubin]]" {{.*}}"[[PTX]]" +// CHK-PTXAS-CUBIN-BUNDLING: fatbinary{{.*}}" "--create=[[FATBIN:.*\.fatbin]]" " +// CHK-PTXAS-CUBIN-BUNDLING-SAME: --embedded-fatbin=[[FATBINC:.*\.fatbin.c]]" " +// CHK-PTXAS-CUBIN-BUNDLING-SAME: --cmdline=--compile-only" "--image=profile={{.*}}[[PTX]]" " +// CHK-PTXAS-CUBIN-BUNDLING-SAME: --image=profile={{.*}}file=[[CUBIN]]" "--cuda" "--device-c" +// CHK-PTXAS-CUBIN-BUNDLING: clang++{{.*}}" "-c" "-o" "[[HOSTDEV:.*\.o]]"{{.*}}" "[[FATBINC]]" "-D__NV_MODULE_ID= +// CHK-PTXAS-CUBIN-BUNDLING-NOT: clang-offload-bundler{{.*}}" 
"-type=o" {{.*}}"-inputs={{.*}}[[CUBIN]] +// CHK-PTXAS-CUBIN-BUNDLING: ld" "-r" "[[HOSTDEV]]" "{{.*}}.o" "-o" "{{.*}}.o" + +/// ########################################################################### + +/// Check object file unbundling is not happening when skipping bundler +// RUN: touch %t.o +// RUN: %clang -### -target powerpc64le-unknown-linux-gnu -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ +// RUN: -no-canonical-prefixes -save-temps %t.o 2>&1 \ +// RUN: | FileCheck -check-prefix=CHK-CUBIN-UNBUNDLING-NVLINK %s + +/// Use DAG to ensure that object file has not been unbundled. +// CHK-CUBIN-UNBUNDLING-NVLINK-DAG: nvlink{{.*}}" {{.*}}"[[OBJ:.*\.o]]" +// CHK-CUBIN-UNBUNDLING-NVLINK-DAG: ld{{.*}}" {{.*}}"[[OBJ]]" + +/// ########################################################################### + +/// Check object file generation is not happening when skipping bundler +// RUN: touch %t1.o +// RUN: touch %t2.o +// RUN: %clang -### -no-canonical-prefixes -target powerpc64le-unknown-linux-gnu -fopenmp=libomp \ +// RUN: -fopenmp-targets=nvptx64-nvidia-cuda %t1.o %t2.o 2>&1 \ +// RUN: | FileCheck -check-prefix=CHK-TWOCUBIN %s +/// Check cubin file generation and usage by nvlink when toolchain has BindArchAction +// RUN: %clang -### -no-canonical-prefixes -target x86_64-apple-darwin17.0.0 -fopenmp=libomp \ +// RUN: -fopenmp-targets=nvptx64-nvidia-cuda %t1.o %t2.o 2>&1 \ +// RUN: | FileCheck -check-prefix=CHK-TWOCUBIN %s + +// CHK-TWOCUBIN: nvlink{{.*}}openmp-offload-{{.*}}.o" "{{.*}}openmp-offload-{{.*}}.o" Index: test/Driver/openmp-offload-gpu.c =================================================================== --- test/Driver/openmp-offload-gpu.c +++ test/Driver/openmp-offload-gpu.c @@ -61,7 +61,7 @@ /// Check cubin file generation and bundling // RUN: %clang -### -target powerpc64le-unknown-linux-gnu -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ -// RUN: -no-canonical-prefixes -save-temps %s -c 2>&1 \ +// RUN: -no-canonical-prefixes -save-temps 
%s -c -fopenmp-use-target-bundling 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-PTXAS-CUBIN-BUNDLING %s // CHK-PTXAS-CUBIN-BUNDLING: clang{{.*}}" "-o" "[[PTX:.*\.s]]" @@ -73,7 +73,7 @@ /// Check cubin file unbundling and usage by nvlink // RUN: touch %t.o // RUN: %clang -### -target powerpc64le-unknown-linux-gnu -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ -// RUN: -no-canonical-prefixes -save-temps %t.o 2>&1 \ +// RUN: -no-canonical-prefixes -save-temps %t.o -fopenmp-use-target-bundling 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-CUBIN-UNBUNDLING-NVLINK %s /// Use DAG to ensure that cubin file has been unbundled. @@ -87,11 +87,11 @@ // RUN: touch %t1.o // RUN: touch %t2.o // RUN: %clang -### -no-canonical-prefixes -target powerpc64le-unknown-linux-gnu -fopenmp=libomp \ -// RUN: -fopenmp-targets=nvptx64-nvidia-cuda %t1.o %t2.o 2>&1 \ +// RUN: -fopenmp-targets=nvptx64-nvidia-cuda %t1.o %t2.o -fopenmp-use-target-bundling 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-TWOCUBIN %s /// Check cubin file generation and usage by nvlink when toolchain has BindArchAction // RUN: %clang -### -no-canonical-prefixes -target x86_64-apple-darwin17.0.0 -fopenmp=libomp \ -// RUN: -fopenmp-targets=nvptx64-nvidia-cuda %t1.o %t2.o 2>&1 \ +// RUN: -fopenmp-targets=nvptx64-nvidia-cuda %t1.o %t2.o -fopenmp-use-target-bundling 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-TWOCUBIN %s // CHK-TWOCUBIN: nvlink{{.*}}openmp-offload-{{.*}}.cubin" "{{.*}}openmp-offload-{{.*}}.cubin" Index: test/Driver/openmp-offload.c =================================================================== --- test/Driver/openmp-offload.c +++ test/Driver/openmp-offload.c @@ -480,13 +480,13 @@ // Create host object and bundle. 
// CHK-BUJOBS: clang{{.*}}" "-cc1" "-triple" "powerpc64le--linux" "-emit-obj" {{.*}}"-fopenmp" {{.*}}"-o" " // CHK-BUJOBS-SAME: [[HOSTOBJ:[^\\/]+\.o]]" "-x" "ir" "{{.*}}[[HOSTBC]]" -// CHK-BUJOBS: clang-offload-bundler{{.*}}" "-type=o" "-targets=openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu,host-powerpc64le--linux" "-outputs= +// CHK-BUJOBS: clang-offload-bundler{{.*}}" "-type=o"{{.*}}"-targets=openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu,host-powerpc64le--linux" "-outputs= // CHK-BUJOBS-SAME: [[RES:[^\\/]+\.o]]" "-inputs={{.*}}[[T1OBJ]],{{.*}}[[T2OBJ]],{{.*}}[[HOSTOBJ]]" // CHK-BUJOBS-ST: clang{{.*}}" "-cc1" "-triple" "powerpc64le--linux" "-S" {{.*}}"-fopenmp" {{.*}}"-o" " // CHK-BUJOBS-ST-SAME: [[HOSTASM:[^\\/]+\.s]]" "-x" "ir" "{{.*}}[[HOSTBC]]" // CHK-BUJOBS-ST: clang{{.*}}" "-cc1as" "-triple" "powerpc64le--linux" "-filetype" "obj" {{.*}}"-o" " // CHK-BUJOBS-ST-SAME: [[HOSTOBJ:[^\\/]+\.o]]" "{{.*}}[[HOSTASM]]" -// CHK-BUJOBS-ST: clang-offload-bundler{{.*}}" "-type=o" "-targets=openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu,host-powerpc64le--linux" "-outputs= +// CHK-BUJOBS-ST: clang-offload-bundler{{.*}}" "-type=o"{{.*}}"-targets=openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu,host-powerpc64le--linux" "-outputs= // CHK-BUJOBS-ST-SAME: [[RES:[^\\/]+\.o]]" "-inputs={{.*}}[[T1OBJ]],{{.*}}[[T2OBJ]],{{.*}}[[HOSTOBJ]]" /// ###########################################################################