diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -266,6 +266,7 @@ LANGOPT(GPUMaxThreadsPerBlock, 32, 1024, "default max threads per block for kernel launch bounds for HIP") LANGOPT(GPUDeferDiag, 1, 0, "defer host/device related diagnostic messages for CUDA/HIP") LANGOPT(GPUExcludeWrongSideOverloads, 1, 0, "always exclude wrong side overloads in overloading resolution for CUDA/HIP") +LANGOPT(OffloadingNewDriver, 1, 0, "use the new driver for generating offloading code.") LANGOPT(SYCLIsDevice , 1, 0, "Generate code for SYCL device") LANGOPT(SYCLIsHost , 1, 0, "SYCL host compilation") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2519,11 +2519,11 @@ PosFlag, NegFlag, BothFlags<[NoArgumentUnused, HelpHidden]>>; def static_openmp: Flag<["-"], "static-openmp">, HelpText<"Use the static host OpenMP runtime while linking.">; -def fopenmp_new_driver : Flag<["-"], "fopenmp-new-driver">, Flags<[CC1Option]>, Group, +def fopenmp_new_driver : Flag<["-"], "fopenmp-new-driver">, Flags<[CC1Option]>, Group, HelpText<"Use the new driver for OpenMP offloading.">; -def fno_openmp_new_driver : Flag<["-"], "fno-openmp-new-driver">, Flags<[CC1Option]>, Group, +def fno_openmp_new_driver : Flag<["-"], "fno-openmp-new-driver">, Flags<[CC1Option]>, Group, HelpText<"Don't use the new driver for OpenMP offloading.">; -def foffload_new_driver : Flag<["-"], "foffload-new-driver">, Flags<[CC1Option]>, Group, +def foffload_new_driver : Flag<["-"], "foffload-new-driver">, Flags<[CC1Option]>, Group, HelpText<"Use the new driver for offloading.">; def fno_optimize_sibling_calls : Flag<["-"], "fno-optimize-sibling-calls">, Group, Flags<[CC1Option]>, HelpText<"Disable tail call optimization, keeping the call stack accurate">, diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp --- a/clang/lib/CodeGen/CGCUDANV.cpp +++ b/clang/lib/CodeGen/CGCUDANV.cpp @@ -157,6 +157,8 @@ llvm::Function *makeModuleDtorFunction(); /// Transform managed variables for device compilation. void transformManagedVars(); + /// Create offloading entries to register globals in RDC mode. + void createOffloadingEntries(); public: CGNVCUDARuntime(CodeGenModule &CGM); @@ -210,7 +212,8 @@ CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()), TheModule(CGM.getModule()), - RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode), + RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode || + CGM.getLangOpts().OffloadingNewDriver), DeviceMC(InitDeviceMC(CGM)) { CodeGen::CodeGenTypes &Types = CGM.getTypes(); ASTContext &Ctx = CGM.getContext(); @@ -1107,6 +1110,40 @@ } } +// Creates offloading entries for all the kernels and globals that must be +// registered. The linker will provide a pointer to this section so we can +// register the symbols with the linked device image. +void CGNVCUDARuntime::createOffloadingEntries() { + llvm::OpenMPIRBuilder OMPBuilder(CGM.getModule()); + OMPBuilder.initialize(); + + StringRef Section = "cuda_offloading_entries"; + for (KernelInfo &I : EmittedKernels) + OMPBuilder.emitOffloadingEntry( + KernelHandles[I.Kernel], getDeviceSideName(cast(I.D)), 0, + DeviceVarFlags::OffloadRegionKernelEntry, Section); + + for (VarInfo &I : DeviceVars) { + uint64_t VarSize = + CGM.getDataLayout().getTypeAllocSize(I.Var->getValueType()); + if (I.Flags.getKind() == DeviceVarFlags::Variable) { + OMPBuilder.emitOffloadingEntry( + I.Var, getDeviceSideName(I.D), VarSize, + I.Flags.isManaged() ? DeviceVarFlags::OffloadGlobalManagedEntry + : DeviceVarFlags::OffloadGlobalVarEntry, + Section); + } else if (I.Flags.getKind() == DeviceVarFlags::Surface) { + OMPBuilder.emitOffloadingEntry(I.Var, getDeviceSideName(I.D), VarSize, + DeviceVarFlags::OffloadGlobalSurfaceEntry, + Section); + } else if (I.Flags.getKind() == DeviceVarFlags::Texture) { + OMPBuilder.emitOffloadingEntry(I.Var, getDeviceSideName(I.D), VarSize, + DeviceVarFlags::OffloadGlobalTextureEntry, + Section); + } + } +} + // Returns module constructor to be added. llvm::Function *CGNVCUDARuntime::finalizeModule() { if (CGM.getLangOpts().CUDAIsDevice) { @@ -1135,7 +1172,11 @@ } return nullptr; } - return makeModuleCtorFunction(); + if (!(CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)) + return makeModuleCtorFunction(); + + createOffloadingEntries(); + return nullptr; } llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F, diff --git a/clang/lib/CodeGen/CGCUDARuntime.h b/clang/lib/CodeGen/CGCUDARuntime.h --- a/clang/lib/CodeGen/CGCUDARuntime.h +++ b/clang/lib/CodeGen/CGCUDARuntime.h @@ -52,6 +52,24 @@ Texture, // Builtin texture }; + /// The kind flag of the target region entry. + enum OffloadRegionEntryKindFlag : uint32_t { + /// Mark the region entry as a kernel. + OffloadRegionKernelEntry = 0x0, + }; + + /// The kind flag of the global variable entry. + enum OffloadVarEntryKindFlag : uint32_t { + /// Mark the entry as a global variable. + OffloadGlobalVarEntry = 0x0, + /// Mark the entry as a managed global variable. + OffloadGlobalManagedEntry = 0x1, + /// Mark the entry as a surface variable. + OffloadGlobalSurfaceEntry = 0x2, + /// Mark the entry as a texture variable. + OffloadGlobalTextureEntry = 0x4, + }; + private: unsigned Kind : 2; unsigned Extern : 1; diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -6053,6 +6053,10 @@ options::OPT_fno_openmp_extensions); } + // Forward the new driver to change offloading code generation. + if (Args.hasArg(options::OPT_foffload_new_driver)) + CmdArgs.push_back("-foffload-new-driver"); + SanitizeArgs.addArgs(TC, Args, CmdArgs, InputType); const XRayArgs &XRay = TC.getXRayArgs(); diff --git a/clang/test/CodeGenCUDA/offloading-entries.cu b/clang/test/CodeGenCUDA/offloading-entries.cu new file mode 100644 --- /dev/null +++ b/clang/test/CodeGenCUDA/offloading-entries.cu @@ -0,0 +1,39 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals +// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu \ +// RUN: -foffload-new-driver -emit-llvm -o - -x cuda %s | FileCheck \ +// RUN: --check-prefix=HOST %s + +#include "Inputs/cuda.h" + +//. +// HOST: @x = internal global i32 undef, align 4 +// HOST: @.omp_offloading.entry_name = internal unnamed_addr constant [8 x i8] c"_Z3foov\00" +// HOST: @.omp_offloading.entry._Z3foov = weak constant %struct.__tgt_offload_entry { ptr @_Z18__device_stub__foov, ptr @.omp_offloading.entry_name, i64 0, i32 0, i32 0 }, section "cuda_offloading_entries", align 1 +// HOST: @.omp_offloading.entry_name.1 = internal unnamed_addr constant [8 x i8] c"_Z3barv\00" +// HOST: @.omp_offloading.entry._Z3barv = weak constant %struct.__tgt_offload_entry { ptr @_Z18__device_stub__barv, ptr @.omp_offloading.entry_name.1, i64 0, i32 0, i32 0 }, section "cuda_offloading_entries", align 1 +// HOST: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [2 x i8] c"x\00" +// HOST: @.omp_offloading.entry.x = weak constant %struct.__tgt_offload_entry { ptr @x, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section "cuda_offloading_entries", align 1 +//. +// HOST-LABEL: @_Z18__device_stub__foov( +// HOST-NEXT: entry: +// HOST-NEXT: [[TMP0:%.*]] = call i32 @cudaLaunch(ptr @_Z18__device_stub__foov) +// HOST-NEXT: br label [[SETUP_END:%.*]] +// HOST: setup.end: +// HOST-NEXT: ret void +// +__global__ void foo() {} +// HOST-LABEL: @_Z18__device_stub__barv( +// HOST-NEXT: entry: +// HOST-NEXT: [[TMP0:%.*]] = call i32 @cudaLaunch(ptr @_Z18__device_stub__barv) +// HOST-NEXT: br label [[SETUP_END:%.*]] +// HOST: setup.end: +// HOST-NEXT: ret void +// +__global__ void bar() {} +__device__ int x = 1; +//. +// HOST: attributes #0 = { noinline norecurse nounwind optnone "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" } +//. +// HOST: !0 = !{i32 1, !"wchar_size", i32 4} +// HOST: !1 = !{!"clang version 15.0.0"} +//. diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -150,6 +150,10 @@ /// section will contain one or more offloading binaries stored contiguously. #define OFFLOAD_SECTION_MAGIC_STR ".llvm.offloading" +/// The magic offset for the first object inside CUDA's fatbinary. This can be +/// different but it should work for what is passed here. +static constexpr unsigned FatbinaryOffset = 0x50; + /// Information for a device offloading file extracted from the host. struct DeviceFile { DeviceFile(StringRef Kind, StringRef TheTriple, StringRef Arch, @@ -163,7 +167,10 @@ }; namespace llvm { -/// Helper that allows DeviceFile to be used as a key in a DenseMap. +/// Helper that allows DeviceFile to be used as a key in a DenseMap. For now we +/// assume device files with matching architectures and triples but different +/// offloading kinds should be handlded together, this may not be true in the +/// future. template <> struct DenseMapInfo { static DeviceFile getEmptyKey() { return {DenseMapInfo::getEmptyKey(), @@ -934,13 +941,37 @@ MemoryBuffer::getFileOrSTDIN(File); if (std::error_code EC = BufferOrErr.getError()) return createFileError(File, EC); + MemoryBufferRef Buffer = **BufferOrErr; file_magic Type = identify_magic((*BufferOrErr)->getBuffer()); - if (Type != file_magic::bitcode) { + switch (Type) { + case file_magic::bitcode: { + Expected> InputFileOrErr = + llvm::lto::InputFile::create(Buffer); + if (!InputFileOrErr) + return InputFileOrErr.takeError(); + + // Save the input file and the buffer associated with its memory. + BitcodeFiles.push_back(std::move(*InputFileOrErr)); + SavedBuffers.push_back(std::move(*BufferOrErr)); + continue; + } + case file_magic::cuda_fatbinary: { + // Cuda fatbinaries made by Clang almost almost have an object eighty + // bytes from the beginning. This should be sufficient to identify the + // symbols. + Buffer = MemoryBufferRef( + (*BufferOrErr)->getBuffer().drop_front(FatbinaryOffset), "FatBinary"); + LLVM_FALLTHROUGH; + } + case file_magic::elf_relocatable: + case file_magic::elf_shared_object: + case file_magic::macho_object: + case file_magic::coff_object: { Expected> ObjFile = - ObjectFile::createObjectFile(**BufferOrErr, Type); + ObjectFile::createObjectFile(Buffer); if (!ObjFile) - return ObjFile.takeError(); + continue; NewInputFiles.push_back(File.str()); for (auto &Sym : (*ObjFile)->symbols()) { @@ -954,15 +985,10 @@ else UsedInSharedLib.insert(Saver.save(*Name)); } - } else { - Expected> InputFileOrErr = - llvm::lto::InputFile::create(**BufferOrErr); - if (!InputFileOrErr) - return InputFileOrErr.takeError(); - - // Save the input file and the buffer associated with its memory. - BitcodeFiles.push_back(std::move(*InputFileOrErr)); - SavedBuffers.push_back(std::move(*BufferOrErr)); + continue; + } + default: + continue; } }