diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1046,6 +1046,14 @@
   HelpText<"Path to libomptarget-amdgcn bitcode library">, Alias<libomptarget_amdgpu_bc_path_EQ>;
 def libomptarget_nvptx_bc_path_EQ : Joined<["--"], "libomptarget-nvptx-bc-path=">, Group<i_Group>,
   HelpText<"Path to libomptarget-nvptx bitcode library">;
+def libomptarget_amdgpu_wrapper_bc_path_EQ : Joined<["--"], "libomptarget-amdgpu-wrapper-bc-path=">,
+  Group<i_Group>, HelpText<"Path to libomptarget-amdgpu math wrapper bitcode library">;
+def libomptarget_nvptx_wrapper_bc_path_EQ : Joined<["--"], "libomptarget-nvptx-wrapper-bc-path=">,
+  Group<i_Group>, HelpText<"Path to libomptarget-nvptx math wrapper bitcode library">;
+def libomptarget_amdgpu_math_bc_path_EQ : Joined<["--"], "libomptarget-amdgpu-math-bc-path=">,
+  Group<i_Group>, HelpText<"Path to libomptarget-amdgpu math bitcode library">;
+def libomptarget_nvptx_math_bc_path_EQ : Joined<["--"], "libomptarget-nvptx-math-bc-path=">,
+  Group<i_Group>, HelpText<"Path to libomptarget-nvptx math bitcode library">;
 def dD : Flag<["-"], "dD">, Group<d_Group>, Flags<[CC1Option]>,
   HelpText<"Print macro definitions in -E mode in addition to normal output">;
 def dI : Flag<["-"], "dI">, Group<d_Group>, Flags<[CC1Option]>,
diff --git a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
--- a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
@@ -285,6 +285,9 @@
   if (DriverArgs.hasArg(options::OPT_nogpulib))
     return;
 
+  if (DriverArgs.hasArg(options::OPT_fopenmp_device_libm))
+    addOpenMPMathRTL(getDriver(), DriverArgs, CC1Args, getTriple());
+
   // Link the bitcode library late if we're using device LTO.
   if (getDriver().isUsingLTO(/* IsOffload */ true))
     return;
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -8305,6 +8305,30 @@
       CmdArgs.push_back(
           Args.MakeArgString("-target-library=" + TC->getTripleString() +
                              "-" + Arch + "=" + BitcodeLibrary.back()));
+
+      // Forward the device math libraries only when the user asked for them,
+      // matching the -fopenmp-device-libm guard used on the cc1 path.
+      if (TCArgs.hasArg(options::OPT_fopenmp_device_libm)) {
+        ArgStringList MathLibrary;
+        addOpenMPMathRTL(TCDriver, TCArgs, MathLibrary, TC->getTriple(),
+                         /*IncludeLibm=*/true);
+
+        if (!MathLibrary.empty())
+          CmdArgs.push_back(
+              Args.MakeArgString("-target-library=" + TC->getTripleString() +
+                                 "-" + Arch + "=" + MathLibrary.back()));
+      }
+
+      // libdevice only exists for NVPTX targets, so skip the CUDA installation
+      // probe for every other offloading toolchain.
+      if (TC->getTriple().isNVPTX()) {
+        CudaInstallationDetector CudaInstallation(D, TheTriple, Args);
+        std::string LibDeviceFile = CudaInstallation.getLibDeviceFile(Arch);
+        if (!LibDeviceFile.empty())
+          CmdArgs.push_back(
+              Args.MakeArgString("-target-library=" + TC->getTripleString() +
+                                 "-" + Arch + "=" + LibDeviceFile));
+      }
     }
 
     // Pass in the optimization level to use for LTO.
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.h b/clang/lib/Driver/ToolChains/CommonArgs.h
--- a/clang/lib/Driver/ToolChains/CommonArgs.h
+++ b/clang/lib/Driver/ToolChains/CommonArgs.h
@@ -189,6 +189,10 @@
                              llvm::opt::ArgStringList &CmdArgs,
                              const llvm::Triple &Triple, bool IsLTO);
 
+void addOpenMPMathRTL(const Driver &D, const llvm::opt::ArgList &DriverArgs,
+                      llvm::opt::ArgStringList &CC1Args,
+                      const llvm::Triple &Triple, bool IncludeLibm = false);
+
 void addOpenMPDeviceRTL(const Driver &D, const llvm::opt::ArgList &DriverArgs,
                         llvm::opt::ArgStringList &CC1Args,
                         StringRef BitcodeSuffix, const llvm::Triple &Triple);
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -1990,6 +1990,106 @@
   }
 }
 
+/// Append "-mlink-builtin-bitcode <file>" to \p CC1Args for the bitcode
+/// library \p BitcodeLibraryName.
+///
+/// If the user passed \p BCPathOpt its value is used exclusively: it may name
+/// the file itself or a directory containing \p BitcodeLibraryName. Otherwise
+/// the first match found in \p LibraryPaths is used. A driver error is
+/// reported when no library can be located.
+static void
+addBitcodeLibrary(const Driver &D, const llvm::opt::ArgList &DriverArgs,
+                  llvm::opt::ArgStringList &CC1Args, const llvm::Triple &Triple,
+                  ArrayRef<StringRef> LibraryPaths,
+                  StringRef BitcodeLibraryName, OptSpecifier BCPathOpt) {
+  // First check whether user specified the bc library.
+  if (const Arg *A = DriverArgs.getLastArg(BCPathOpt)) {
+    SmallString<128> LibOmpTargetFile(A->getValue());
+    if (llvm::sys::fs::exists(LibOmpTargetFile) &&
+        llvm::sys::fs::is_directory(LibOmpTargetFile)) {
+      llvm::sys::path::append(LibOmpTargetFile, BitcodeLibraryName);
+    }
+
+    if (llvm::sys::fs::exists(LibOmpTargetFile)) {
+      CC1Args.push_back("-mlink-builtin-bitcode");
+      CC1Args.push_back(DriverArgs.MakeArgString(LibOmpTargetFile));
+    } else {
+      D.Diag(diag::err_drv_omp_offload_target_bcruntime_not_found)
+          << LibOmpTargetFile;
+    }
+    return;
+  }
+
+  bool FoundBCLibrary = false;
+
+  for (StringRef LibraryPath : LibraryPaths) {
+    SmallString<128> LibOmpTargetFile(LibraryPath);
+    llvm::sys::path::append(LibOmpTargetFile, BitcodeLibraryName);
+    if (llvm::sys::fs::exists(LibOmpTargetFile)) {
+      CC1Args.push_back("-mlink-builtin-bitcode");
+      CC1Args.push_back(DriverArgs.MakeArgString(LibOmpTargetFile));
+      FoundBCLibrary = true;
+      break;
+    }
+  }
+
+  if (!FoundBCLibrary)
+    D.Diag(diag::err_drv_omp_offload_target_missingbcruntime)
+        << BitcodeLibraryName << (Triple.isAMDGCN() ? "amdgpu" : "nvptx");
+}
+
+/// Link the OpenMP device math libraries for \p Triple into \p CC1Args.
+///
+/// The math wrapper library is always added. The device libm itself is added
+/// as well, unless offload LTO is enabled and \p IncludeLibm is false, in
+/// which case only the wrappers are linked.
+void tools::addOpenMPMathRTL(const Driver &D,
+                             const llvm::opt::ArgList &DriverArgs,
+                             llvm::opt::ArgStringList &CC1Args,
+                             const llvm::Triple &Triple, bool IncludeLibm) {
+  SmallVector<StringRef, 8> LibraryPaths;
+
+  // Add path to clang lib / lib64 folder.
+  SmallString<256> DefaultLibPath = llvm::sys::path::parent_path(D.Dir);
+  llvm::sys::path::append(DefaultLibPath, Twine("lib") + CLANG_LIBDIR_SUFFIX);
+  LibraryPaths.emplace_back(DefaultLibPath.c_str());
+
+  // Add user defined library paths from LIBRARY_PATH.
+  llvm::Optional<std::string> LibPath =
+      llvm::sys::Process::GetEnv("LIBRARY_PATH");
+  if (LibPath) {
+    SmallVector<StringRef, 8> Frags;
+    const char EnvPathSeparatorStr[] = {llvm::sys::EnvPathSeparator, '\0'};
+    llvm::SplitString(*LibPath, Frags, EnvPathSeparatorStr);
+    for (StringRef Path : Frags)
+      LibraryPaths.emplace_back(Path.trim());
+  }
+
+  StringRef ArchPrefix = Triple.isAMDGCN() ? "amdgpu" : "nvptx";
+
+  OptSpecifier MathWrapperBCPathOpt =
+      Triple.isAMDGCN() ? options::OPT_libomptarget_amdgpu_wrapper_bc_path_EQ
+                        : options::OPT_libomptarget_nvptx_wrapper_bc_path_EQ;
+  std::string MathWrapperName =
+      ("libomptarget-" + ArchPrefix + "-math-wrappers.bc").str();
+
+  addBitcodeLibrary(D, DriverArgs, CC1Args, Triple, LibraryPaths,
+                    MathWrapperName, MathWrapperBCPathOpt);
+
+  // If we are doing LTO only link the OpenMP math wrappers.
+  if (D.isUsingLTO(/* IsOffload */ true) && !IncludeLibm)
+    return;
+
+  OptSpecifier MathBCPathOpt =
+      Triple.isAMDGCN() ? options::OPT_libomptarget_amdgpu_math_bc_path_EQ
+                        : options::OPT_libomptarget_nvptx_math_bc_path_EQ;
+  std::string DeviceMathName =
+      ("libomptarget-" + ArchPrefix + "-libm.bc").str();
+
+  addBitcodeLibrary(D, DriverArgs, CC1Args, Triple, LibraryPaths,
+                    DeviceMathName, MathBCPathOpt);
+}
+
 void tools::addOpenMPDeviceRTL(const Driver &D,
                                const llvm::opt::ArgList &DriverArgs,
                                llvm::opt::ArgStringList &CC1Args,
@@ -2021,37 +2121,6 @@
   std::string LibOmpTargetName =
       ("libomptarget-" + ArchPrefix + "-" + BitcodeSuffix + ".bc").str();
 
-  // First check whether user specifies bc library
-  if (const Arg *A = DriverArgs.getLastArg(LibomptargetBCPathOpt)) {
-    SmallString<128> LibOmpTargetFile(A->getValue());
-    if (llvm::sys::fs::exists(LibOmpTargetFile) &&
-        llvm::sys::fs::is_directory(LibOmpTargetFile)) {
-      llvm::sys::path::append(LibOmpTargetFile, LibOmpTargetName);
-    }
-
-    if (llvm::sys::fs::exists(LibOmpTargetFile)) {
-      CC1Args.push_back("-mlink-builtin-bitcode");
-      CC1Args.push_back(DriverArgs.MakeArgString(LibOmpTargetFile));
-    } else {
-      D.Diag(diag::err_drv_omp_offload_target_bcruntime_not_found)
-          << LibOmpTargetFile;
-    }
-  } else {
-    bool FoundBCLibrary = false;
-
-    for (StringRef LibraryPath : LibraryPaths) {
-      SmallString<128> LibOmpTargetFile(LibraryPath);
-      llvm::sys::path::append(LibOmpTargetFile, LibOmpTargetName);
-      if (llvm::sys::fs::exists(LibOmpTargetFile)) {
-        CC1Args.push_back("-mlink-builtin-bitcode");
-        CC1Args.push_back(DriverArgs.MakeArgString(LibOmpTargetFile));
-        FoundBCLibrary = true;
-        break;
-      }
-    }
-
-    if (!FoundBCLibrary)
-      D.Diag(diag::err_drv_omp_offload_target_missingbcruntime)
-          << LibOmpTargetName << ArchPrefix;
-  }
+  addBitcodeLibrary(D, DriverArgs, CC1Args, Triple, LibraryPaths,
+                    LibOmpTargetName, LibomptargetBCPathOpt);
 }
diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -696,9 +696,6 @@
     return;
   }
 
-  CC1Args.push_back("-mlink-builtin-bitcode");
-  CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile));
-
   clang::CudaVersion CudaInstallationVersion = CudaInstallation.version();
 
   // New CUDA versions often introduce new instructions that are only supported
@@ -744,12 +741,19 @@
       return;
     }
 
+    if (DriverArgs.hasArg(options::OPT_fopenmp_device_libm))
+      addOpenMPMathRTL(getDriver(), DriverArgs, CC1Args, getTriple());
+
     // Link the bitcode library late if we're using device LTO.
     if (getDriver().isUsingLTO(/* IsOffload */ true))
       return;
 
     addOpenMPDeviceRTL(getDriver(), DriverArgs, CC1Args, GpuArch.str(),
                        getTriple());
+
+    CC1Args.push_back("-mlink-builtin-bitcode");
+    CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile));
+
     AddStaticDeviceLibsPostLinking(getDriver(), DriverArgs, CC1Args, "nvptx",
                                    GpuArch, /*isBitCodeSDL=*/true,
                                    /*postClangLink=*/true);
diff --git a/clang/test/Driver/Inputs/libomptarget/libomptarget-nvptx-math.bc b/clang/test/Driver/Inputs/libomptarget/libomptarget-nvptx-math.bc
new file mode 100644
diff --git a/clang/test/Driver/Inputs/libomptarget/libomptarget-nvptx-wrapper.bc b/clang/test/Driver/Inputs/libomptarget/libomptarget-nvptx-wrapper.bc
new file mode 100644
diff --git a/clang/test/Driver/openmp-offload-gpu.c b/clang/test/Driver/openmp-offload-gpu.c
--- a/clang/test/Driver/openmp-offload-gpu.c
+++ b/clang/test/Driver/openmp-offload-gpu.c
@@ -346,3 +346,13 @@
 // RUN: | FileCheck -check-prefix=NEW_DRIVER_EMBEDDING %s
 
 // NEW_DRIVER_EMBEDDING: -fembed-offload-object=[[CUBIN:.*\.cubin]],nvptx64-nvidia-cuda.sm_70
+
+// RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target=nvptx64-nvidia-cuda -march=sm_70 \
+// RUN: --libomptarget-nvptx-wrapper-bc-path=%S/Inputs/libomptarget/libomptarget-nvptx-wrapper.bc \
+// RUN: --libomptarget-nvptx-math-bc-path=%S/Inputs/libomptarget/libomptarget-nvptx-math.bc \
+// RUN: --libomptarget-nvptx-bc-path=%S/Inputs/libomptarget/libomptarget-new-nvptx-test.bc \
+// RUN: -foffload-lto -fopenmp-device-libm -fopenmp-new-driver -no-canonical-prefixes %s -o openmp-offload-gpu 2>&1 \
+// RUN: | FileCheck -check-prefix=DEVICE_LIBM %s
+
+// DEVICE_LIBM: clang{{.*}}"-mlink-builtin-bitcode" "{{.*}}/Inputs/libomptarget/libomptarget-nvptx-wrapper.bc"
+// DEVICE_LIBM: clang-linker-wrapper{{.*}}"-target-library=nvptx64-nvidia-cuda-sm_70={{.*}}/Inputs/libomptarget/libomptarget-new-nvptx-test.bc" "-target-library=nvptx64-nvidia-cuda-sm_70={{.*}}/Inputs/libomptarget/libomptarget-nvptx-math.bc"