diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -4295,6 +4295,11 @@
   // Select the appropriate action.
   RewriteKind rewriteKind = RK_None;
 
+  // True unless JA.isDeviceOffloading() reports OFK_None or OFK_Host. Hoisted
+  // so that both the LTO check and the -fembed-bitcode handling can use it.
+  bool isDeviceOffloadAction = !(JA.isDeviceOffloading(Action::OFK_None) ||
+                                 JA.isDeviceOffloading(Action::OFK_Host));
+
   // If CollectArgsForIntegratedAssembler() isn't called below, claim the args
   // it claims when not running an assembler. Otherwise, clang would emit
   // "argument unused" warnings for assembler flags when e.g. adding "-E" to
@@ -4401,9 +4406,6 @@
       CmdArgs.push_back("-emit-llvm-uselists");
 
     // Device-side jobs do not support LTO.
-    bool isDeviceOffloadAction = !(JA.isDeviceOffloading(Action::OFK_None) ||
-                                   JA.isDeviceOffloading(Action::OFK_Host));
-
     if (D.isUsingLTO() && !isDeviceOffloadAction) {
       Args.AddLastArg(CmdArgs, options::OPT_flto, options::OPT_flto_EQ);
       CmdArgs.push_back("-flto-unit");
@@ -4436,7 +4438,17 @@
     // Add flags implied by -fembed-bitcode.
     Args.AddLastArg(CmdArgs, options::OPT_fembed_bitcode_EQ);
     // Disable all llvm IR level optimizations.
-    CmdArgs.push_back("-disable-llvm-passes");
+    if (!isDeviceOffloadAction) {
+      CmdArgs.push_back("-disable-llvm-passes");
+    } else {
+      // Device offload jobs do not get -disable-llvm-passes; pass the target
+      // CPU explicitly instead, when one can be determined.
+      std::string CPU = getCPUName(Args, Triple, /*FromAs=*/false);
+      if (!CPU.empty()) {
+        CmdArgs.push_back("-target-cpu");
+        CmdArgs.push_back(Args.MakeArgString(CPU));
+      }
+    }
 
     // Render target options.
     TC.addClangTargetOptions(Args, CmdArgs, JA.getOffloadingDeviceKind());
diff --git a/clang/test/Driver/embed-bitcode-nvptx.cu b/clang/test/Driver/embed-bitcode-nvptx.cu
new file mode 100644
--- /dev/null
+++ b/clang/test/Driver/embed-bitcode-nvptx.cu
@@ -0,0 +1,8 @@
+// RUN: %clang -Xclang -triple -Xclang nvptx64 -S -Xclang -target-feature -Xclang +ptx70 -fembed-bitcode=all --cuda-device-only -nocudalib -nocudainc %s -o - | FileCheck %s
+// REQUIRES: nvptx-registered-target
+//
+// CHECK:.global .align 1 .b8 llvm_$_embedded_$_module[
+
+__device__ void foo(int mask) {
+  __nvvm_bar_warp_sync(mask);
+}