diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -266,6 +266,7 @@ LANGOPT(GPUMaxThreadsPerBlock, 32, 1024, "default max threads per block for kernel launch bounds for HIP") LANGOPT(GPUDeferDiag, 1, 0, "defer host/device related diagnostic messages for CUDA/HIP") LANGOPT(GPUExcludeWrongSideOverloads, 1, 0, "always exclude wrong side overloads in overloading resolution for CUDA/HIP") +LANGOPT(OffloadingNewDriver, 1, 0, "use the new driver for generating offloading code.") LANGOPT(SYCLIsDevice , 1, 0, "Generate code for SYCL device") LANGOPT(SYCLIsHost , 1, 0, "SYCL host compilation") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2526,9 +2526,9 @@ PosFlag, NegFlag, BothFlags<[NoArgumentUnused, HelpHidden]>>; def static_openmp: Flag<["-"], "static-openmp">, HelpText<"Use the static host OpenMP runtime while linking.">; -def offload_new_driver : Flag<["--"], "offload-new-driver">, Flags<[CC1Option]>, Group, - HelpText<"Use the new driver for offloading compilation.">; -def no_offload_new_driver : Flag<["--"], "no-offload-new-driver">, Flags<[CC1Option]>, Group, +def offload_new_driver : Flag<["--"], "offload-new-driver">, Flags<[CC1Option]>, Group, + MarshallingInfoFlag>, HelpText<"Use the new driver for offloading compilation.">; +def no_offload_new_driver : Flag<["--"], "no-offload-new-driver">, Flags<[CC1Option]>, Group, HelpText<"Don't Use the new driver for offloading compilation.">; def offload_device_only : Flag<["--"], "offload-device-only">, HelpText<"Only compile for the offloading device.">; @@ -2543,7 +2543,7 @@ def cuda_compile_host_device : Flag<["--"], "cuda-compile-host-device">, Alias, HelpText<"Compile CUDA code for both host and device (default). Has no " "effect on non-CUDA compilations.">; -def fopenmp_new_driver : Flag<["-"], "fopenmp-new-driver">, Flags<[CC1Option]>, Group, +def fopenmp_new_driver : Flag<["-"], "fopenmp-new-driver">, Flags<[CC1Option]>, Group, HelpText<"Use the new driver for OpenMP offloading.">; def fno_openmp_new_driver : Flag<["-"], "fno-openmp-new-driver">, Flags<[CC1Option]>, Group, Alias, HelpText<"Don't use the new driver for OpenMP offloading.">; diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp --- a/clang/lib/CodeGen/CGCUDANV.cpp +++ b/clang/lib/CodeGen/CGCUDANV.cpp @@ -157,6 +157,8 @@ llvm::Function *makeModuleDtorFunction(); /// Transform managed variables for device compilation. void transformManagedVars(); + /// Create offloading entries to register globals in RDC mode. + void createOffloadingEntries(); public: CGNVCUDARuntime(CodeGenModule &CGM); @@ -210,7 +212,8 @@ CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()), TheModule(CGM.getModule()), - RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode), + RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode || + CGM.getLangOpts().OffloadingNewDriver), DeviceMC(InitDeviceMC(CGM)) { CodeGen::CodeGenTypes &Types = CGM.getTypes(); ASTContext &Ctx = CGM.getContext(); @@ -1107,6 +1110,40 @@ } } +// Creates offloading entries for all the kernels and globals that must be +// registered. The linker will provide a pointer to this section so we can +// register the symbols with the linked device image. +void CGNVCUDARuntime::createOffloadingEntries() { + llvm::OpenMPIRBuilder OMPBuilder(CGM.getModule()); + OMPBuilder.initialize(); + + StringRef Section = "cuda_offloading_entries"; + for (KernelInfo &I : EmittedKernels) + OMPBuilder.emitOffloadingEntry( + KernelHandles[I.Kernel], getDeviceSideName(cast(I.D)), 0, + DeviceVarFlags::OffloadRegionKernelEntry, Section); + + for (VarInfo &I : DeviceVars) { + uint64_t VarSize = + CGM.getDataLayout().getTypeAllocSize(I.Var->getValueType()); + if (I.Flags.getKind() == DeviceVarFlags::Variable) { + OMPBuilder.emitOffloadingEntry( + I.Var, getDeviceSideName(I.D), VarSize, + I.Flags.isManaged() ? DeviceVarFlags::OffloadGlobalManagedEntry + : DeviceVarFlags::OffloadGlobalVarEntry, + Section); + } else if (I.Flags.getKind() == DeviceVarFlags::Surface) { + OMPBuilder.emitOffloadingEntry(I.Var, getDeviceSideName(I.D), VarSize, + DeviceVarFlags::OffloadGlobalSurfaceEntry, + Section); + } else if (I.Flags.getKind() == DeviceVarFlags::Texture) { + OMPBuilder.emitOffloadingEntry(I.Var, getDeviceSideName(I.D), VarSize, + DeviceVarFlags::OffloadGlobalTextureEntry, + Section); + } + } +} + // Returns module constructor to be added. llvm::Function *CGNVCUDARuntime::finalizeModule() { if (CGM.getLangOpts().CUDAIsDevice) { @@ -1135,7 +1172,11 @@ } return nullptr; } - return makeModuleCtorFunction(); + if (!(CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)) + return makeModuleCtorFunction(); + + createOffloadingEntries(); + return nullptr; } llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F, diff --git a/clang/lib/CodeGen/CGCUDARuntime.h b/clang/lib/CodeGen/CGCUDARuntime.h --- a/clang/lib/CodeGen/CGCUDARuntime.h +++ b/clang/lib/CodeGen/CGCUDARuntime.h @@ -52,6 +52,24 @@ Texture, // Builtin texture }; + /// The kind flag of the target region entry. + enum OffloadRegionEntryKindFlag : uint32_t { + /// Mark the region entry as a kernel. + OffloadRegionKernelEntry = 0x0, + }; + + /// The kind flag of the global variable entry. + enum OffloadVarEntryKindFlag : uint32_t { + /// Mark the entry as a global variable. + OffloadGlobalVarEntry = 0x0, + /// Mark the entry as a managed global variable. + OffloadGlobalManagedEntry = 0x1, + /// Mark the entry as a surface variable. + OffloadGlobalSurfaceEntry = 0x2, + /// Mark the entry as a texture variable. + OffloadGlobalTextureEntry = 0x3, + }; + private: unsigned Kind : 2; unsigned Extern : 1; diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -6082,6 +6082,10 @@ options::OPT_fno_openmp_extensions); } + // Forward the new driver to change offloading code generation. + if (Args.hasArg(options::OPT_offload_new_driver)) + CmdArgs.push_back("--offload-new-driver"); + SanitizeArgs.addArgs(TC, Args, CmdArgs, InputType); const XRayArgs &XRay = TC.getXRayArgs(); diff --git a/clang/test/CodeGenCUDA/offloading-entries.cu b/clang/test/CodeGenCUDA/offloading-entries.cu new file mode 100644 --- /dev/null +++ b/clang/test/CodeGenCUDA/offloading-entries.cu @@ -0,0 +1,33 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals +// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu \ +// RUN: --offload-new-driver -emit-llvm -o - -x cuda %s | FileCheck \ +// RUN: --check-prefix=HOST %s + +#include "Inputs/cuda.h" + +//. +// HOST: @x = internal global i32 undef, align 4 +// HOST: @.omp_offloading.entry_name = internal unnamed_addr constant [8 x i8] c"_Z3foov\00" +// HOST: @.omp_offloading.entry._Z3foov = weak constant %struct.__tgt_offload_entry { ptr @_Z18__device_stub__foov, ptr @.omp_offloading.entry_name, i64 0, i32 0, i32 0 }, section "cuda_offloading_entries", align 1 +// HOST: @.omp_offloading.entry_name.1 = internal unnamed_addr constant [8 x i8] c"_Z3barv\00" +// HOST: @.omp_offloading.entry._Z3barv = weak constant %struct.__tgt_offload_entry { ptr @_Z18__device_stub__barv, ptr @.omp_offloading.entry_name.1, i64 0, i32 0, i32 0 }, section "cuda_offloading_entries", align 1 +// HOST: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [2 x i8] c"x\00" +// HOST: @.omp_offloading.entry.x = weak constant %struct.__tgt_offload_entry { ptr @x, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section "cuda_offloading_entries", align 1 +//. +// HOST-LABEL: @_Z18__device_stub__foov( +// HOST-NEXT: entry: +// HOST-NEXT: [[TMP0:%.*]] = call i32 @cudaLaunch(ptr @_Z18__device_stub__foov) +// HOST-NEXT: br label [[SETUP_END:%.*]] +// HOST: setup.end: +// HOST-NEXT: ret void +// +__global__ void foo() {} +// HOST-LABEL: @_Z18__device_stub__barv( +// HOST-NEXT: entry: +// HOST-NEXT: [[TMP0:%.*]] = call i32 @cudaLaunch(ptr @_Z18__device_stub__barv) +// HOST-NEXT: br label [[SETUP_END:%.*]] +// HOST: setup.end: +// HOST-NEXT: ret void +// +__global__ void bar() {} +__device__ int x = 1;