diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -4366,9 +4366,9 @@ IsHeaderModulePrecompile ? HeaderModuleInput : Inputs[0]; InputInfoList ModuleHeaderInputs; + InputInfoList OpenMPHostInputs; const InputInfo *CudaDeviceInput = nullptr; const InputInfo *OpenMPDeviceInput = nullptr; - const InputInfo *OpenMPHostInput = nullptr; for (const InputInfo &I : Inputs) { if (&I == &Input) { // This is the primary input. @@ -4385,8 +4385,8 @@ CudaDeviceInput = &I; } else if (IsOpenMPDevice && !OpenMPDeviceInput) { OpenMPDeviceInput = &I; - } else if (IsOpenMPHost && !OpenMPHostInput) { - OpenMPHostInput = &I; + } else if (IsOpenMPHost) { + OpenMPHostInputs.push_back(I); } else { llvm_unreachable("unexpectedly given multiple inputs"); } @@ -6894,6 +6894,25 @@ } } + // Host-side OpenMP offloading receives the device object files and embeds them + // in a named section including the associated target triple and architecture. + if (IsOpenMPHost && !OpenMPHostInputs.empty()) { + auto InputFile = OpenMPHostInputs.begin(); + auto OpenMPTCs = C.getOffloadToolChains(); + for (auto TI = OpenMPTCs.first, TE = OpenMPTCs.second; TI != TE; + ++TI, ++InputFile) { + const ToolChain *TC = TI->second; + const ArgList &TCArgs = C.getArgsForToolChain(TC, "", Action::OFK_OpenMP); + StringRef File = + C.getArgs().MakeArgString(TC->getInputFilename(*InputFile)); + StringRef InputName = Clang::getBaseInputStem(Args, Inputs); + + CmdArgs.push_back(Args.MakeArgString( + "-fembed-offload-object=" + File + "," + TC->getTripleString() + "." + + TCArgs.getLastArgValue(options::OPT_march_EQ) + "." 
+ InputName)); + } + } + if (Triple.isAMDGPU()) { handleAMDGPUCodeObjectVersionOptions(D, Args, CmdArgs); diff --git a/clang/test/Driver/openmp-offload-gpu.c b/clang/test/Driver/openmp-offload-gpu.c --- a/clang/test/Driver/openmp-offload-gpu.c +++ b/clang/test/Driver/openmp-offload-gpu.c @@ -353,3 +353,10 @@ // NEW_DRIVER: "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[DEVICE_ASM]]"], output: "[[DEVICE_OBJ:.+]]" // NEW_DRIVER: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[HOST_BC]]", "[[DEVICE_OBJ]]"], output: "[[HOST_OBJ:.+]]" // NEW_DRIVER: "x86_64-unknown-linux-gnu" - "[[LINKER:.+]]", inputs: ["[[HOST_OBJ]]"], output: "openmp-offload-gpu" + +// RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target=nvptx64-nvida-cuda -march=sm_70 \ +// RUN: --libomptarget-nvptx-bc-path=%S/Inputs/libomptarget/libomptarget-new-nvptx-test.bc \ +// RUN: -fopenmp-new-driver -no-canonical-prefixes %s -o openmp-offload-gpu 2>&1 \ +// RUN: | FileCheck -check-prefix=NEW_DRIVER_EMBEDDING %s + +// NEW_DRIVER_EMBEDDING: -fembed-offload-object=[[CUBIN:.*\.cubin]],nvptx64-nvidia-cuda.sm_70 diff --git a/clang/test/Frontend/embed-object.ll b/clang/test/Frontend/embed-object.ll --- a/clang/test/Frontend/embed-object.ll +++ b/clang/test/Frontend/embed-object.ll @@ -1,9 +1,11 @@ ; RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm \ -; RUN: -fembed-offload-object=%S/Inputs/empty.h,section -x ir %s -o - \ +; RUN: -fembed-offload-object=%S/Inputs/empty.h,section1 \ +; RUN: -fembed-offload-object=%S/Inputs/empty.h,section2 -x ir %s -o - \ ; RUN: | FileCheck %s -check-prefix=CHECK -; CHECK: @llvm.embedded.object = private constant [0 x i8] zeroinitializer, section ".llvm.offloading.section" -; CHECK: @llvm.compiler.used = appending global [2 x i8*] [i8* @x, i8* getelementptr inbounds ([0 x i8], [0 x i8]* @llvm.embedded.object, i32 0, i32 0)], section "llvm.metadata" +; CHECK: @[[OBJECT1:.+]] = private constant [0 x i8] zeroinitializer, section 
".llvm.offloading.section1" +; CHECK: @[[OBJECT2:.+]] = private constant [0 x i8] zeroinitializer, section ".llvm.offloading.section2" +; CHECK: @llvm.compiler.used = appending global [3 x i8*] [i8* @x, i8* getelementptr inbounds ([0 x i8], [0 x i8]* @[[OBJECT1]], i32 0, i32 0), i8* getelementptr inbounds ([0 x i8], [0 x i8]* @[[OBJECT2]], i32 0, i32 0)], section "llvm.metadata" @x = private constant i8 1 @llvm.compiler.used = appending global [1 x i8*] [i8* @x], section "llvm.metadata"