Index: include/clang/Basic/LangOptions.def =================================================================== --- include/clang/Basic/LangOptions.def +++ include/clang/Basic/LangOptions.def @@ -211,7 +211,7 @@ LANGOPT(CUDAAllowVariadicFunctions, 1, 0, "allowing variadic functions in CUDA device code") LANGOPT(CUDAHostDeviceConstexpr, 1, 1, "treating unattributed constexpr functions as __host__ __device__") LANGOPT(CUDADeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions") -LANGOPT(CUDARelocatableDeviceCode, 1, 0, "generate relocatable device code") +LANGOPT(GPURelocatableDeviceCode, 1, 0, "generate relocatable device code") LANGOPT(SizedDeallocation , 1, 0, "sized deallocation") LANGOPT(AlignedAllocation , 1, 0, "aligned allocation") Index: include/clang/Driver/Options.td =================================================================== --- include/clang/Driver/Options.td +++ include/clang/Driver/Options.td @@ -584,9 +584,11 @@ def fcuda_approx_transcendentals : Flag<["-"], "fcuda-approx-transcendentals">, Flags<[CC1Option]>, HelpText<"Use approximate transcendental functions">; def fno_cuda_approx_transcendentals : Flag<["-"], "fno-cuda-approx-transcendentals">; -def fcuda_rdc : Flag<["-"], "fcuda-rdc">, Flags<[CC1Option]>, +def fgpu_rdc : Flag<["-"], "fgpu-rdc">, Flags<[CC1Option]>, HelpText<"Generate relocatable device code, also known as separate compilation mode.">; -def fno_cuda_rdc : Flag<["-"], "fno-cuda-rdc">; +def fno_gpu_rdc : Flag<["-"], "fno-gpu-rdc">; +def : Flag<["-"], "fcuda-rdc">, Alias; +def : Flag<["-"], "fno-cuda-rdc">, Alias; def fcuda_short_ptr : Flag<["-"], "fcuda-short-ptr">, Flags<[CC1Option]>, HelpText<"Use 32-bit pointers for accessing const/local/shared address spaces.">; def fno_cuda_short_ptr : Flag<["-"], "fno-cuda-short-ptr">; Index: include/clang/Driver/Types.def =================================================================== --- include/clang/Driver/Types.def +++ include/clang/Driver/Types.def @@ -101,4 +101,5 @@ TYPE("dSYM", dSYM, INVALID, "dSYM", "A") TYPE("dependencies", Dependencies, INVALID, "d", "") TYPE("cuda-fatbin", CUDA_FATBIN, INVALID, "fatbin","A") +TYPE("hip-fatbin", HIP_FATBIN, INVALID, "hipfb", "A") TYPE("none", Nothing, INVALID, nullptr, "u") Index: lib/AST/Decl.cpp =================================================================== --- lib/AST/Decl.cpp +++ lib/AST/Decl.cpp @@ -2451,7 +2451,7 @@ // // With CUDA relocatable device code enabled, these variables don't get // special handling; they're treated like regular extern variables. - if (LangOpts.CUDA && !LangOpts.CUDARelocatableDeviceCode && + if (LangOpts.CUDA && !LangOpts.GPURelocatableDeviceCode && hasExternalStorage() && hasAttr() && isa(getType())) return true; Index: lib/CodeGen/CGCUDANV.cpp =================================================================== --- lib/CodeGen/CGCUDANV.cpp +++ lib/CodeGen/CGCUDANV.cpp @@ -137,7 +137,7 @@ CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()), TheModule(CGM.getModule()), - RelocatableDeviceCode(CGM.getLangOpts().CUDARelocatableDeviceCode) { + RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode) { CodeGen::CodeGenTypes &Types = CGM.getTypes(); ASTContext &Ctx = CGM.getContext(); @@ -353,8 +353,8 @@ // global variable and save a reference in GpuBinaryHandle to be cleaned up // in destructor on exit. Then associate all known kernels with the GPU binary // handle so CUDA runtime can figure out what to call on the GPU side. - std::unique_ptr CudaGpuBinary; - if (!IsHIP) { + std::unique_ptr CudaGpuBinary = nullptr; + if (!CudaGpuBinaryFileName.empty()) { llvm::ErrorOr> CudaGpuBinaryOrErr = llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName); if (std::error_code EC = CudaGpuBinaryOrErr.getError()) { @@ -388,15 +388,23 @@ ModuleIDSectionName = "__hip_module_id"; ModuleIDPrefix = "__hip_"; - // For HIP, create an external symbol __hip_fatbin in section .hip_fatbin. - // The external symbol is supposed to contain the fat binary but will be - // populated somewhere else, e.g. by lld through link script. - FatBinStr = new llvm::GlobalVariable( + if (CudaGpuBinary) { + // If fatbin is available from early finalization, create a string + // literal containing the fat binary loaded from the given file. + FatBinStr = makeConstantString(CudaGpuBinary->getBuffer(), "", + FatbinConstantName, 8); + } else { + // If fatbin is not available, create an external symbol + // __hip_fatbin in section .hip_fatbin. The external symbol is supposed + // to contain the fat binary but will be populated somewhere else, + // e.g. by lld through link script. + FatBinStr = new llvm::GlobalVariable( CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr, "__hip_fatbin", nullptr, llvm::GlobalVariable::NotThreadLocal); - cast(FatBinStr)->setSection(FatbinConstantName); + cast(FatBinStr)->setSection(FatbinConstantName); + } FatMagic = HIPFatMagic; } else { @@ -447,6 +455,8 @@ // thread safety of the loaded program. Therefore we can assume sequential // execution of constructor functions here. if (IsHIP) { + auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage : + llvm::GlobalValue::LinkOnceAnyLinkage; llvm::BasicBlock *IfBlock = llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc); llvm::BasicBlock *ExitBlock = @@ -455,12 +465,13 @@ // of HIP ABI. GpuBinaryHandle = new llvm::GlobalVariable( TheModule, VoidPtrPtrTy, /*isConstant=*/false, - llvm::GlobalValue::LinkOnceAnyLinkage, + Linkage, /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__hip_gpubin_handle"); GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getQuantity()); // Prevent the weak symbol in different shared libraries being merged. - GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility); + if (Linkage != llvm::GlobalValue::InternalLinkage) + GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility); Address GpuBinaryAddr( GpuBinaryHandle, CharUnits::fromQuantity(GpuBinaryHandle->getAlignment())); Index: lib/Driver/Driver.cpp =================================================================== --- lib/Driver/Driver.cpp +++ lib/Driver/Driver.cpp @@ -2486,11 +2486,13 @@ class HIPActionBuilder final : public CudaActionBuilderBase { /// The linker inputs obtained for each device arch. SmallVector DeviceLinkerInputs; + bool Relocatable; public: HIPActionBuilder(Compilation &C, DerivedArgList &Args, const Driver::InputList &Inputs) - : CudaActionBuilderBase(C, Args, Inputs, Action::OFK_HIP) {} + : CudaActionBuilderBase(C, Args, Inputs, Action::OFK_HIP), + Relocatable(false) {} bool canUseBundlerUnbundler() const override { return true; } @@ -2499,23 +2501,68 @@ phases::ID CurPhase, phases::ID FinalPhase, PhasesTy &Phases) override { // amdgcn does not support linking of object files, therefore we skip - // backend and assemble phases to output LLVM IR. - if (CudaDeviceActions.empty() || CurPhase == phases::Backend || + // backend and assemble phases to output LLVM IR. Except for generating + // non-relocatable device coee, where we generate fat binary for device + // code and pass to host in Backend phase. + if (CudaDeviceActions.empty() || + (CurPhase == phases::Backend && Relocatable) || CurPhase == phases::Assemble) return ABRT_Success; - assert((CurPhase == phases::Link || + assert(((CurPhase == phases::Link && Relocatable) || CudaDeviceActions.size() == GpuArchList.size()) && "Expecting one action per GPU architecture."); assert(!CompileHostOnly && "Not expecting CUDA actions in host-only compilation."); - // Save CudaDeviceActions to DeviceLinkerInputs for each GPU subarch. - // This happens to each device action originated from each input file. - // Later on, device actions in DeviceLinkerInputs are used to create - // device link actions in appendLinkDependences and the created device - // link actions are passed to the offload action as device dependence. - if (CurPhase == phases::Link) { + if (!Relocatable && CurPhase == phases::Backend) { + // If we are in backend phase, we attempt to generate the fat binary. + // We compile each arch to IR and use a link action to generate code + // object containing ISA. Then we use a special "link" action to create + // a fat binary containing all the code objects for different GPU's. + // The fat binary is then an input to the host action. + for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) { + // Create a link action to link device IR with device library + // and generate ISA. + ActionList AL; + AL.push_back(CudaDeviceActions[I]); + CudaDeviceActions[I] = + C.MakeAction(AL, types::TY_Image); + + // OffloadingActionBuilder propagates device arch until an offload + // action. Since the next action for creating fatbin does + // not have device arch, whereas the above link action and its input + // have device arch, an offload action is needed to stop the null + // device arch of the next action being propagated to the above link + // action. + OffloadAction::DeviceDependences DDep; + DDep.add(*CudaDeviceActions[I], *ToolChains.front(), + CudaArchToString(GpuArchList[I]), AssociatedOffloadKind); + CudaDeviceActions[I] = C.MakeAction( + DDep, CudaDeviceActions[I]->getType()); + } + // Create HIP fat binary with a special "link" action. + CudaFatBinary = + C.MakeAction(CudaDeviceActions, + types::TY_HIP_FATBIN); + + DA.add(*CudaFatBinary, *ToolChains.front(), /*BoundArch=*/nullptr, + AssociatedOffloadKind); + // Clear the fat binary, it is already a dependence to an host + // action. + CudaFatBinary = nullptr; + + // Remove the CUDA actions as they are already connected to an host + // action or fat binary. + CudaDeviceActions.clear(); + + return ABRT_Success; + } else if (CurPhase == phases::Link) { + // Save CudaDeviceActions to DeviceLinkerInputs for each GPU subarch. + // This happens to each device action originated from each input file. + // Later on, device actions in DeviceLinkerInputs are used to create + // device link actions in appendLinkDependences and the created device + // link actions are passed to the offload action as device dependence. DeviceLinkerInputs.resize(CudaDeviceActions.size()); auto LI = DeviceLinkerInputs.begin(); for (auto *A : CudaDeviceActions) { @@ -2548,6 +2595,13 @@ ++I; } } + + bool initialize() override { + Relocatable = Args.hasFlag(options::OPT_fgpu_rdc, + options::OPT_fno_gpu_rdc, /*Default=*/false); + + return CudaActionBuilderBase::initialize(); + } }; /// OpenMP action builder. The host bitcode is passed to the device frontend Index: lib/Driver/ToolChains/Clang.cpp =================================================================== --- lib/Driver/ToolChains/Clang.cpp +++ lib/Driver/ToolChains/Clang.cpp @@ -4920,16 +4920,16 @@ CmdArgs.push_back(Args.MakeArgString(Flags)); } - if (IsCuda) { - // Host-side cuda compilation receives all device-side outputs in a single - // fatbin as Inputs[1]. Include the binary with -fcuda-include-gpubinary. - if (CudaDeviceInput) { + // Host-side cuda compilation receives all device-side outputs in a single + // fatbin as Inputs[1]. Include the binary with -fcuda-include-gpubinary. + if ((IsCuda || IsHIP) && CudaDeviceInput) { CmdArgs.push_back("-fcuda-include-gpubinary"); CmdArgs.push_back(CudaDeviceInput->getFilename()); - } + if (Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false)) + CmdArgs.push_back("-fgpu-rdc"); + } - if (Args.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc, false)) - CmdArgs.push_back("-fcuda-rdc"); + if (IsCuda) { if (Args.hasFlag(options::OPT_fcuda_short_ptr, options::OPT_fno_cuda_short_ptr, false)) CmdArgs.push_back("-fcuda-short-ptr"); Index: lib/Driver/ToolChains/CommonArgs.cpp =================================================================== --- lib/Driver/ToolChains/CommonArgs.cpp +++ lib/Driver/ToolChains/CommonArgs.cpp @@ -15,6 +15,7 @@ #include "Arch/SystemZ.h" #include "Arch/X86.h" #include "Hexagon.h" +#include "HIP.h" #include "InputInfo.h" #include "clang/Basic/CharInfo.h" #include "clang/Basic/LangOptions.h" @@ -1337,6 +1338,18 @@ if (!JA.isHostOffloading(Action::OFK_HIP)) return; + InputInfoList DeviceInputs; + for (const auto &II : Inputs) { + const Action *A = II.getAction(); + // Is this a device linking action? + if (A && isa(A) && A->isDeviceOffloading(Action::OFK_HIP)) { + DeviceInputs.push_back(II); + } + } + + if (DeviceInputs.empty()) + return; + // Create temporary linker script. Keep it if save-temps is enabled. const char *LKS; SmallString<256> Name = llvm::sys::path::filename(Output.getFilename()); @@ -1364,39 +1377,12 @@ "Wrong platform"); (void)HIPTC; - // Construct clang-offload-bundler command to bundle object files for - // for different GPU archs. - ArgStringList BundlerArgs; - BundlerArgs.push_back(Args.MakeArgString("-type=o")); - - // ToDo: Remove the dummy host binary entry which is required by - // clang-offload-bundler. - std::string BundlerTargetArg = "-targets=host-x86_64-unknown-linux"; - std::string BundlerInputArg = "-inputs=/dev/null"; - - for (const auto &II : Inputs) { - const Action *A = II.getAction(); - // Is this a device linking action? - if (A && isa(A) && A->isDeviceOffloading(Action::OFK_HIP)) { - BundlerTargetArg = BundlerTargetArg + ",hip-amdgcn-amd-amdhsa-" + - StringRef(A->getOffloadingArch()).str(); - BundlerInputArg = BundlerInputArg + "," + II.getFilename(); - } - } - BundlerArgs.push_back(Args.MakeArgString(BundlerTargetArg)); - BundlerArgs.push_back(Args.MakeArgString(BundlerInputArg)); - - std::string BundleFileName = C.getDriver().GetTemporaryPath("BUNDLE", "o"); + // The output file name needs to persist through the compilation, therefore + // it needs to be created through MakeArgString. + std::string BundleFileName = C.getDriver().GetTemporaryPath("BUNDLE", "hipfb"); const char *BundleFile = C.addTempFile(C.getArgs().MakeArgString(BundleFileName.c_str())); - auto BundlerOutputArg = - Args.MakeArgString(std::string("-outputs=").append(BundleFile)); - BundlerArgs.push_back(BundlerOutputArg); - - SmallString<128> BundlerPath(C.getDriver().Dir); - llvm::sys::path::append(BundlerPath, "clang-offload-bundler"); - const char *Bundler = Args.MakeArgString(BundlerPath); - C.addCommand(llvm::make_unique(JA, T, Bundler, BundlerArgs, Inputs)); + AMDGCN::constructHIPFatbinCommand(C, JA, BundleFile, DeviceInputs, Args, T); // Add commands to embed target binaries. We ensure that each section and // image is 16-byte aligned. This is not mandatory, but increases the Index: lib/Driver/ToolChains/Cuda.cpp =================================================================== --- lib/Driver/ToolChains/Cuda.cpp +++ lib/Driver/ToolChains/Cuda.cpp @@ -398,8 +398,8 @@ options::OPT_fnoopenmp_relocatable_target, /*Default=*/true); else if (JA.isOffloading(Action::OFK_Cuda)) - Relocatable = Args.hasFlag(options::OPT_fcuda_rdc, - options::OPT_fno_cuda_rdc, /*Default=*/false); + Relocatable = Args.hasFlag(options::OPT_fgpu_rdc, + options::OPT_fno_gpu_rdc, /*Default=*/false); if (Relocatable) CmdArgs.push_back("-c"); @@ -609,9 +609,9 @@ options::OPT_fno_cuda_approx_transcendentals, false)) CC1Args.push_back("-fcuda-approx-transcendentals"); - if (DriverArgs.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc, + if (DriverArgs.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false)) - CC1Args.push_back("-fcuda-rdc"); + CC1Args.push_back("-fgpu-rdc"); } if (DriverArgs.hasArg(options::OPT_nocudalib)) Index: lib/Driver/ToolChains/HIP.h =================================================================== --- lib/Driver/ToolChains/HIP.h +++ lib/Driver/ToolChains/HIP.h @@ -19,6 +19,11 @@ namespace tools { namespace AMDGCN { + // Construct command for creating HIP fatbin. + void constructHIPFatbinCommand(Compilation &C, const JobAction &JA, + StringRef OutputFileName, const InputInfoList &Inputs, + const llvm::opt::ArgList &TCArgs, const Tool& T); + // Runs llvm-link/opt/llc/lld, which links multiple LLVM bitcode, together with // device library, then compiles it to ISA in a shared object. class LLVM_LIBRARY_VISIBILITY Linker : public Tool { Index: lib/Driver/ToolChains/HIP.cpp =================================================================== --- lib/Driver/ToolChains/HIP.cpp +++ lib/Driver/ToolChains/HIP.cpp @@ -184,6 +184,40 @@ C.addCommand(llvm::make_unique(JA, *this, Lld, LldArgs, Inputs)); } +// Construct a clang-offload-bundler command to bundle code objects for +// different GPU's into a HIP fat binary. +void AMDGCN::constructHIPFatbinCommand(Compilation &C, const JobAction &JA, + StringRef OutputFileName, const InputInfoList &Inputs, + const llvm::opt::ArgList &Args, const Tool& T) { + // Construct clang-offload-bundler command to bundle object files for + // for different GPU archs. + ArgStringList BundlerArgs; + BundlerArgs.push_back(Args.MakeArgString("-type=o")); + + // ToDo: Remove the dummy host binary entry which is required by + // clang-offload-bundler. + std::string BundlerTargetArg = "-targets=host-x86_64-unknown-linux"; + std::string BundlerInputArg = "-inputs=/dev/null"; + + for (const auto &II : Inputs) { + const auto* A = II.getAction(); + BundlerTargetArg = BundlerTargetArg + ",hip-amdgcn-amd-amdhsa-" + + StringRef(A->getOffloadingArch()).str(); + BundlerInputArg = BundlerInputArg + "," + II.getFilename(); + } + BundlerArgs.push_back(Args.MakeArgString(BundlerTargetArg)); + BundlerArgs.push_back(Args.MakeArgString(BundlerInputArg)); + + auto BundlerOutputArg = + Args.MakeArgString(std::string("-outputs=").append(OutputFileName)); + BundlerArgs.push_back(BundlerOutputArg); + + SmallString<128> BundlerPath(C.getDriver().Dir); + llvm::sys::path::append(BundlerPath, "clang-offload-bundler"); + const char *Bundler = Args.MakeArgString(BundlerPath); + C.addCommand(llvm::make_unique(JA, T, Bundler, BundlerArgs, Inputs)); +} + // For amdgcn the inputs of the linker job are device bitcode and output is // object file. It calls llvm-link, opt, llc, then lld steps. void AMDGCN::Linker::ConstructJob(Compilation &C, const JobAction &JA, @@ -192,6 +226,9 @@ const ArgList &Args, const char *LinkingOutput) const { + if (JA.getType() == types::TY_HIP_FATBIN) + return constructHIPFatbinCommand(C, JA, Output.getFilename(), Inputs, Args, *this); + assert(getToolChain().getTriple().getArch() == llvm::Triple::amdgcn && "Unsupported target"); @@ -244,9 +281,9 @@ options::OPT_fno_cuda_approx_transcendentals, false)) CC1Args.push_back("-fcuda-approx-transcendentals"); - if (DriverArgs.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc, + if (DriverArgs.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false)) - CC1Args.push_back("-fcuda-rdc"); + CC1Args.push_back("-fgpu-rdc"); // Default to "hidden" visibility, as object level linking will not be // supported for the foreseeable future. Index: lib/Frontend/CompilerInvocation.cpp =================================================================== --- lib/Frontend/CompilerInvocation.cpp +++ lib/Frontend/CompilerInvocation.cpp @@ -2220,7 +2220,7 @@ if (Opts.CUDAIsDevice && Args.hasArg(OPT_fcuda_approx_transcendentals)) Opts.CUDADeviceApproxTranscendentals = 1; - Opts.CUDARelocatableDeviceCode = Args.hasArg(OPT_fcuda_rdc); + Opts.GPURelocatableDeviceCode = Args.hasArg(OPT_fgpu_rdc); if (Opts.ObjC1) { if (Arg *arg = Args.getLastArg(OPT_fobjc_runtime_EQ)) { Index: lib/Sema/SemaDeclAttr.cpp =================================================================== --- lib/Sema/SemaDeclAttr.cpp +++ lib/Sema/SemaDeclAttr.cpp @@ -4143,7 +4143,7 @@ const auto *VD = cast(D); // extern __shared__ is only allowed on arrays with no length (e.g. // "int x[]"). - if (!S.getLangOpts().CUDARelocatableDeviceCode && VD->hasExternalStorage() && + if (!S.getLangOpts().GPURelocatableDeviceCode && VD->hasExternalStorage() && !isa(VD->getType())) { S.Diag(AL.getLoc(), diag::err_cuda_extern_shared) << VD; return; Index: test/CodeGenCUDA/device-stub.cu =================================================================== --- test/CodeGenCUDA/device-stub.cu +++ test/CodeGenCUDA/device-stub.cu @@ -6,22 +6,22 @@ // RUN: -fcuda-include-gpubinary %t -o - -DNOGLOBALS \ // RUN: | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=NOGLOBALS,CUDANOGLOBALS // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \ -// RUN: -fcuda-rdc -fcuda-include-gpubinary %t -o - \ +// RUN: -fgpu-rdc -fcuda-include-gpubinary %t -o - \ // RUN: | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,RDC,CUDA,CUDARDC // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - \ // RUN: | FileCheck -allow-deprecated-dag-overlap %s -check-prefix=NOGPUBIN // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \ // RUN: -fcuda-include-gpubinary %t -o - -x hip\ -// RUN: | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,NORDC,HIP +// RUN: | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,NORDC,HIP,HIPEF // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \ // RUN: -fcuda-include-gpubinary %t -o - -DNOGLOBALS -x hip \ // RUN: | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=NOGLOBALS,HIPNOGLOBALS // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \ -// RUN: -fcuda-rdc -fcuda-include-gpubinary %t -o - -x hip \ -// RUN: | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,NORDC,HIP +// RUN: -fgpu-rdc -fcuda-include-gpubinary %t -o - -x hip \ +// RUN: | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,NORDC,HIP,HIPEF // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - -x hip\ -// RUN: | FileCheck -allow-deprecated-dag-overlap %s -check-prefix=NOGPUBIN +// RUN: | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=ALL,NORDC,HIP,HIPNEF #include "Inputs/cuda.h" @@ -64,8 +64,9 @@ // * constant unnamed string with the kernel name // ALL: private unnamed_addr constant{{.*}}kernelfunc{{.*}}\00" // * constant unnamed string with GPU binary -// HIP: @[[FATBIN:__hip_fatbin]] = external constant i8, section ".hip_fatbin" // CUDA: @[[FATBIN:.*]] = private constant{{.*GPU binary would be here.*}}\00", +// HIPEF: @[[FATBIN:.*]] = private constant{{.*GPU binary would be here.*}}\00", +// HIPNEF: @[[FATBIN:__hip_fatbin]] = external constant i8, section ".hip_fatbin" // CUDANORDC-SAME: section ".nv_fatbin", align 8 // CUDARDC-SAME: section "__nv_relfatbin", align 8 // * constant struct that wraps GPU binary @@ -74,13 +75,14 @@ // CUDA-SAME: { i32 1180844977, i32 1, // HIP-SAME: { i32 1212764230, i32 1, // CUDA-SAME: i8* getelementptr inbounds ({{.*}}@[[FATBIN]], i64 0, i64 0), -// HIP-SAME: i8* @[[FATBIN]], +// HIPEF-SAME: i8* getelementptr inbounds ({{.*}}@[[FATBIN]], i64 0, i64 0), +// HIPNEF-SAME: i8* @[[FATBIN]], // ALL-SAME: i8* null } // CUDA-SAME: section ".nvFatBinSegment" // HIP-SAME: section ".hipFatBinSegment" // * variable to save GPU binary handle after initialization // CUDANORDC: @__[[PREFIX]]_gpubin_handle = internal global i8** null -// HIP: @__[[PREFIX]]_gpubin_handle = linkonce hidden global i8** null +// HIPNEF: @__[[PREFIX]]_gpubin_handle = linkonce hidden global i8** null // * constant unnamed string with NVModuleID // RDC: [[MODULE_ID_GLOBAL:@.*]] = private constant // CUDARDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32 @@ -157,7 +159,7 @@ // device-side globals, but we still need to register GPU binary. // Skip GPU binary string first. // CUDANOGLOBALS: @{{.*}} = private constant{{.*}} -// HIPNOGLOBALS: @{{.*}} = external constant{{.*}} +// HIPNOGLOBALS: @{{.*}} = internal constant{{.*}} // NOGLOBALS-NOT: define internal void @__{{.*}}_register_globals // NOGLOBALS: define internal void @__[[PREFIX:cuda|hip]]_module_ctor // NOGLOBALS: call{{.*}}[[PREFIX]]RegisterFatBinary{{.*}}__[[PREFIX]]_fatbin_wrapper Index: test/Driver/cuda-external-tools.cu =================================================================== --- test/Driver/cuda-external-tools.cu +++ test/Driver/cuda-external-tools.cu @@ -19,7 +19,7 @@ // RUN: %clang -### -target x86_64-linux-gnu -Ofast -c %s 2>&1 \ // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,OPT3 %s // Generating relocatable device code -// RUN: %clang -### -target x86_64-linux-gnu -fcuda-rdc -c %s 2>&1 \ +// RUN: %clang -### -target x86_64-linux-gnu -fgpu-rdc -c %s 2>&1 \ // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,RDC %s // With debugging enabled, ptxas should be run with with no ptxas optimizations. @@ -46,21 +46,21 @@ // RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -c %s 2>&1 \ // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35 %s // Separate compilation targeting sm_35. -// RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -fcuda-rdc -c %s 2>&1 \ +// RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -fgpu-rdc -c %s 2>&1 \ // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,RDC %s // 32-bit compile. // RUN: %clang -### -target i386-linux-gnu -c %s 2>&1 \ // RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20 %s // 32-bit compile when generating relocatable device code. -// RUN: %clang -### -target i386-linux-gnu -fcuda-rdc -c %s 2>&1 \ +// RUN: %clang -### -target i386-linux-gnu -fgpu-rdc -c %s 2>&1 \ // RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20,RDC %s // Compile with -fintegrated-as. This should still cause us to invoke ptxas. // RUN: %clang -### -target x86_64-linux-gnu -fintegrated-as -c %s 2>&1 \ // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,OPT0 %s // Check that we still pass -c when generating relocatable device code. -// RUN: %clang -### -target x86_64-linux-gnu -fintegrated-as -fcuda-rdc -c %s 2>&1 \ +// RUN: %clang -### -target x86_64-linux-gnu -fintegrated-as -fgpu-rdc -c %s 2>&1 \ // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,RDC %s // Check -Xcuda-ptxas and -Xcuda-fatbinary @@ -77,11 +77,11 @@ // RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20 %s // Check relocatable device code generation on MacOS. -// RUN: %clang -### -target x86_64-apple-macosx -O0 -fcuda-rdc -c %s 2>&1 \ +// RUN: %clang -### -target x86_64-apple-macosx -O0 -fgpu-rdc -c %s 2>&1 \ // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,RDC %s -// RUN: %clang -### -target x86_64-apple-macosx --cuda-gpu-arch=sm_35 -fcuda-rdc -c %s 2>&1 \ +// RUN: %clang -### -target x86_64-apple-macosx --cuda-gpu-arch=sm_35 -fgpu-rdc -c %s 2>&1 \ // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,RDC %s -// RUN: %clang -### -target i386-apple-macosx -fcuda-rdc -c %s 2>&1 \ +// RUN: %clang -### -target i386-apple-macosx -fgpu-rdc -c %s 2>&1 \ // RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20,RDC %s // Check that CLANG forwards the -v flag to PTXAS. @@ -92,12 +92,12 @@ // CHECK: "-cc1" // ARCH64-SAME: "-triple" "nvptx64-nvidia-cuda" // ARCH32-SAME: "-triple" "nvptx-nvidia-cuda" +// RDC-SAME: "-fgpu-rdc" +// CHECK-NOT: "-fgpu-rdc" // SM20-SAME: "-target-cpu" "sm_20" // SM35-SAME: "-target-cpu" "sm_35" // SM20-SAME: "-o" "[[PTXFILE:[^"]*]]" // SM35-SAME: "-o" "[[PTXFILE:[^"]*]]" -// RDC-SAME: "-fcuda-rdc" -// CHECK-NOT: "-fcuda-rdc" // Match the call to ptxas (which assembles PTX to SASS). // CHECK: ptxas @@ -141,7 +141,7 @@ // ARCH64-SAME: "-triple" "x86_64- // ARCH32-SAME: "-triple" "i386- // CHECK-SAME: "-fcuda-include-gpubinary" "[[FATBINARY]]" -// RDC-SAME: "-fcuda-rdc" -// CHECK-NOT: "-fcuda-rdc" +// RDC-SAME: "-fgpu-rdc" +// CHECK-NOT: "-fgpu-rdc" // CHK-PTXAS-VERBOSE: ptxas{{.*}}" "-v" Index: test/Driver/cuda-phases.cu =================================================================== --- test/Driver/cuda-phases.cu +++ test/Driver/cuda-phases.cu @@ -11,12 +11,21 @@ // // Test single gpu architecture with complete compilation. // +// Test CUDA NVPTX phases. // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \ // RUN: --cuda-gpu-arch=sm_30 %s 2>&1 \ // RUN: | FileCheck -check-prefixes=BIN,BIN_NV %s +// +// Test HIP AMDGPU -fgpu-rdc phases. +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \ +// RUN: --cuda-gpu-arch=gfx803 -fgpu-rdc %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=BIN,BIN_AMD,BIN_AMD_RDC %s +// +// Test HIP AMDGPU -fno-gpu-rdc phases (default). // RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \ // RUN: --cuda-gpu-arch=gfx803 %s 2>&1 \ -// RUN: | FileCheck -check-prefixes=BIN,BIN_AMD %s +// RUN: | FileCheck -check-prefixes=BIN,BIN_AMD,BIN_AMD_NRDC %s +// // BIN_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]]) // BIN_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]]) // BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]]) @@ -32,12 +41,17 @@ // BIN_NV-DAG: [[P10:[0-9]+]]: linker, {[[P8]], [[P9]]}, cuda-fatbin, (device-[[T]]) // BIN_NV-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-[[T]] ([[TRIPLE]])" {[[P10]]}, ir // BIN_NV-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]]) -// BIN_AMD-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]]) +// BIN_AMD_RDC-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]]) +// BIN_AMD_NRDC-DAG: [[P6:[0-9]+]]: linker, {[[P5]]}, image, (device-hip, [[ARCH]]) +// BIN_AMD_NRDC-DAG: [[P7:[0-9]+]]: offload, "device-hip (amdgcn-amd-amdhsa:[[ARCH]])" {[[P6]]}, image +// BIN_AMD_NRDC-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, hip-fatbin, (device-hip) +// BIN_AMD_NRDC-DAG: [[P11:[0-9]+]]: offload, "host-hip (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-hip (amdgcn-amd-amdhsa)" {[[P8]]}, ir +// BIN_AMD_NRDC-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]]) // BIN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]]) // BIN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]]) -// BIN_AMD-DAG: [[P15:[0-9]+]]: linker, {[[P5]]}, image, (device-[[T]], [[ARCH]]) -// BIN_AMD-DAG: [[P16:[0-9]+]]: offload, "host-[[T]] (powerpc64le-ibm-linux-gnu)" {[[P14]]}, -// BIN_AMD-DAG-SAME: "device-[[T]] ([[TRIPLE:amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P15]]}, object +// BIN_AMD_RDC-DAG: [[P15:[0-9]+]]: linker, {[[P5]]}, image, (device-[[T]], [[ARCH]]) +// BIN_AMD_RDC-DAG: [[P16:[0-9]+]]: offload, "host-[[T]] (powerpc64le-ibm-linux-gnu)" {[[P14]]}, +// BIN_AMD_RDC-DAG-SAME: "device-[[T]] ([[TRIPLE:amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P15]]}, object // // Test single gpu architecture up to the assemble phase. @@ -46,7 +60,10 @@ // RUN: --cuda-gpu-arch=sm_30 %s -S 2>&1 \ // RUN: | FileCheck -check-prefixes=ASM,ASM_NV %s // RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \ -// RUN: --cuda-gpu-arch=gfx803 %s -S 2>&1 \ +// RUN: --cuda-gpu-arch=gfx803 -fgpu-rdc %s -S 2>&1 \ +// RUN: | FileCheck -check-prefixes=ASM,ASM_AMD %s +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \ +// RUN: --cuda-gpu-arch=gfx803 -fcuda-rdc %s -S 2>&1 \ // RUN: | FileCheck -check-prefixes=ASM,ASM_AMD %s // ASM_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH:sm_30]]) // ASM_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH:gfx803]]) @@ -66,7 +83,7 @@ // RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s 2>&1 \ // RUN: | FileCheck -check-prefixes=BIN2,BIN2_NV %s // RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \ -// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s 2>&1 \ +// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 -fgpu-rdc %s 2>&1 \ // RUN: | FileCheck -check-prefixes=BIN2,BIN2_AMD %s // BIN2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]]) // BIN2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]]) @@ -105,7 +122,7 @@ // RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s -S 2>&1 \ // RUN: | FileCheck -check-prefixes=ASM2,ASM2_NV %s // RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \ -// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s -S 2>&1 \ +// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 -fgpu-rdc %s -S 2>&1 \ // RUN: | FileCheck -check-prefixes=ASM2,ASM2_AMD %s // ASM2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH1:sm_30]]) // ASM2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH1:gfx803]]) Index: test/Driver/hip-output-file-name.hip =================================================================== --- test/Driver/hip-output-file-name.hip +++ test/Driver/hip-output-file-name.hip @@ -2,7 +2,7 @@ // REQUIRES: x86-registered-target // REQUIRES: amdgpu-registered-target -// RUN: %clang -### -c -target x86_64-linux-gnu \ +// RUN: %clang -### -c -target x86_64-linux-gnu -fgpu-rdc \ // RUN: -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \ // RUN: 2>&1 | FileCheck %s Index: test/Driver/hip-toolchain-no-rdc.hip =================================================================== --- /dev/null +++ test/Driver/hip-toolchain-no-rdc.hip @@ -0,0 +1,150 @@ +// REQUIRES: clang-driver +// REQUIRES: x86-registered-target +// REQUIRES: amdgpu-registered-target + +// RUN: %clang -### -target x86_64-linux-gnu -fno-gpu-rdc \ +// RUN: -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \ +// RUN: --hip-device-lib=lib1.bc --hip-device-lib=lib2.bc \ +// RUN: --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \ +// RUN: --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib2 \ +// RUN: -fuse-ld=lld \ +// RUN: %S/Inputs/hip_multiple_inputs/a.cu \ +// RUN: %S/Inputs/hip_multiple_inputs/b.hip \ +// RUN: 2>&1 | FileCheck -check-prefixes=CHECK %s + +// +// Compile device code in a.cu to code object for gfx803. +// + +// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa" +// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc" +// CHECK-SAME: {{.*}} "-main-file-name" "a.cu" {{.*}} "-target-cpu" "gfx803" +// CHECK-SAME: "-fcuda-is-device" "-fvisibility" "hidden" +// CHECK-SAME: {{.*}} "-o" [[A_BC_803:".*bc"]] "-x" "hip" +// CHECK-SAME: {{.*}} [[A_SRC:".*a.cu"]] + +// CHECK: [[LLVM_LINK:"*.llvm-link"]] [[A_BC_803]] +// CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc" +// CHECK-SAME: "-o" [[LINKED_BC_DEV_A_803:".*-gfx803-linked-.*bc"]] + +// CHECK: [[OPT:".*opt"]] [[LINKED_BC_DEV_A_803]] "-mtriple=amdgcn-amd-amdhsa" +// CHECK-SAME: "-mcpu=gfx803" +// CHECK-SAME: "-o" [[OPT_BC_DEV_A_803:".*-gfx803-optimized.*bc"]] + +// CHECK: [[LLC: ".*llc"]] [[OPT_BC_DEV_A_803]] "-mtriple=amdgcn-amd-amdhsa" +// CHECK-SAME: "-filetype=obj" "-mcpu=gfx803" "-o" [[OBJ_DEV_A_803:".*-gfx803-.*o"]] + +// CHECK: [[LLD: ".*lld"]] "-flavor" "gnu" "--no-undefined" "-shared" +// CHECK-SAME: "-o" "[[IMG_DEV_A_803:.*out]]" [[OBJ_DEV_A_803]] + +// +// Compile device code in a.cu to code object for gfx900. +// + +// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa" +// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc" +// CHECK-SAME: {{.*}} "-main-file-name" "a.cu" {{.*}} "-target-cpu" "gfx900" +// CHECK-SAME: "-fcuda-is-device" "-fvisibility" "hidden" +// CHECK-SAME: {{.*}} "-o" [[A_BC_900:".*bc"]] "-x" "hip" +// CHECK-SAME: {{.*}} [[A_SRC]] + +// CHECK: [[LLVM_LINK:"*.llvm-link"]] [[A_BC_900]] +// CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc" +// CHECK-SAME: "-o" [[LINKED_BC_DEV_A_900:".*-gfx900-linked-.*bc"]] + +// CHECK: [[OPT:".*opt"]] [[LINKED_BC_DEV_A_900]] "-mtriple=amdgcn-amd-amdhsa" +// CHECK-SAME: "-mcpu=gfx900" +// CHECK-SAME: "-o" [[OPT_BC_DEV_A_900:".*-gfx900-optimized.*bc"]] + +// CHECK: [[LLC: ".*llc"]] [[OPT_BC_DEV_A_900]] "-mtriple=amdgcn-amd-amdhsa" +// CHECK-SAME: "-filetype=obj" "-mcpu=gfx900" "-o" [[OBJ_DEV_A_900:".*-gfx900-.*o"]] + +// CHECK: [[LLD: ".*lld"]] "-flavor" "gnu" "--no-undefined" "-shared" +// CHECK-SAME: "-o" "[[IMG_DEV_A_900:.*out]]" [[OBJ_DEV_A_900]] + +// +// Bundle and embed device code in host object for a.cu. +// + +// CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o" +// CHECK-SAME: "-targets={{.*}},hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" +// CHECK-SAME: "-inputs={{.*}},[[IMG_DEV_A_803]],[[IMG_DEV_A_900]]" "-outputs=[[BUNDLE_A:.*hipfb]]" + +// CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu" +// CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa" "-emit-obj" +// CHECK-SAME: {{.*}} "-main-file-name" "a.cu" +// CHECK-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip" +// CHECK-SAME: {{.*}} [[A_SRC]] +// CHECK-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]" + +// +// Compile device code in b.hip to code object for gfx803. +// + +// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa" +// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc" +// CHECK-SAME: {{.*}} "-main-file-name" "b.hip" {{.*}} "-target-cpu" "gfx803" +// CHECK-SAME: "-fcuda-is-device" "-fvisibility" "hidden" +// CHECK-SAME: {{.*}} "-o" [[B_BC_803:".*bc"]] "-x" "hip" +// CHECK-SAME: {{.*}} [[B_SRC:".*b.hip"]] + +// CHECK: [[LLVM_LINK:"*.llvm-link"]] [[B_BC_803]] +// CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc" +// CHECK-SAME: "-o" [[LINKED_BC_DEV_B_803:".*-gfx803-linked-.*bc"]] + +// CHECK: [[OPT:".*opt"]] [[LINKED_BC_DEV_B_803]] "-mtriple=amdgcn-amd-amdhsa" +// CHECK-SAME: "-mcpu=gfx803" +// CHECK-SAME: "-o" [[OPT_BC_DEV_B_803:".*-gfx803-optimized.*bc"]] + +// CHECK: [[LLC: ".*llc"]] [[OPT_BC_DEV_B_803]] "-mtriple=amdgcn-amd-amdhsa" +// CHECK-SAME: "-filetype=obj" "-mcpu=gfx803" "-o" [[OBJ_DEV_B_803:".*-gfx803-.*o"]] + +// CHECK: [[LLD: ".*lld"]] "-flavor" "gnu" "--no-undefined" "-shared" +// CHECK-SAME: "-o" "[[IMG_DEV_B_803:.*out]]" [[OBJ_DEV_B_803]] + +// +// Compile device code in b.hip to code object for gfx900. +// + +// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa" +// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc" +// CHECK-SAME: {{.*}} "-main-file-name" "b.hip" {{.*}} "-target-cpu" "gfx900" +// CHECK-SAME: "-fcuda-is-device" "-fvisibility" "hidden" +// CHECK-SAME: {{.*}} "-o" [[B_BC_900:".*bc"]] "-x" "hip" +// CHECK-SAME: {{.*}} [[B_SRC]] + +// CHECK: [[LLVM_LINK:"*.llvm-link"]] [[B_BC_900]] +// CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc" +// CHECK-SAME: "-o" [[LINKED_BC_DEV_B_900:".*-gfx900-linked-.*bc"]] + +// CHECK: [[OPT:".*opt"]] [[LINKED_BC_DEV_B_900]] "-mtriple=amdgcn-amd-amdhsa" +// CHECK-SAME: "-mcpu=gfx900" +// CHECK-SAME: "-o" [[OPT_BC_DEV_B_900:".*-gfx900-optimized.*bc"]] + +// CHECK: [[LLC: ".*llc"]] [[OPT_BC_DEV_B_900]] "-mtriple=amdgcn-amd-amdhsa" +// CHECK-SAME: "-filetype=obj" "-mcpu=gfx900" "-o" [[OBJ_DEV_B_900:".*-gfx900-.*o"]] + +// CHECK: [[LLD: ".*lld"]] "-flavor" "gnu" "--no-undefined" "-shared" +// CHECK-SAME: "-o" "[[IMG_DEV_B_900:.*out]]" [[OBJ_DEV_B_900]] + +// +// Bundle and embed device code in host object for b.hip. +// + +// CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o" +// CHECK-SAME: "-targets={{.*}},hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" +// CHECK-SAME: "-inputs={{.*}},[[IMG_DEV_B_803]],[[IMG_DEV_B_900]]" "-outputs=[[BUNDLE_A:.*hipfb]]" + +// CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu" +// CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa" "-emit-obj" +// CHECK-SAME: {{.*}} "-main-file-name" "b.hip" +// CHECK-SAME: {{.*}} "-o" [[B_OBJ_HOST:".*o"]] "-x" "hip" +// CHECK-SAME: {{.*}} [[B_SRC]] +// CHECK-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]" + +// +// Link host objects. +// + +// CHECK: [[LD:".*ld.*"]] {{.*}} [[A_OBJ_HOST]] [[B_OBJ_HOST]] +// CHECK-NOT: "-T" "{{.*}}.lk" Index: test/Driver/hip-toolchain-rdc.hip =================================================================== --- /dev/null +++ test/Driver/hip-toolchain-rdc.hip @@ -0,0 +1,86 @@ +// REQUIRES: clang-driver +// REQUIRES: x86-registered-target +// REQUIRES: amdgpu-registered-target + +// RUN: %clang -### -target x86_64-linux-gnu \ +// RUN: -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \ +// RUN: --hip-device-lib=lib1.bc --hip-device-lib=lib2.bc \ +// RUN: --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \ +// RUN: --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib2 \ +// RUN: -fuse-ld=lld -fgpu-rdc \ +// RUN: %S/Inputs/hip_multiple_inputs/a.cu \ +// RUN: %S/Inputs/hip_multiple_inputs/b.hip \ +// RUN: 2>&1 | FileCheck %s + +// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa" +// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc" +// CHECK-SAME: {{.*}} "-main-file-name" "a.cu" {{.*}} "-target-cpu" "gfx803" +// CHECK-SAME: "-fcuda-is-device" "-fgpu-rdc" "-fvisibility" "hidden" +// CHECK-SAME: {{.*}} "-o" [[A_BC:".*bc"]] "-x" "hip" +// CHECK-SAME: {{.*}} [[A_SRC:".*a.cu"]] + +// CHECK: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa" +// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc" +// CHECK-SAME: {{.*}} "-main-file-name" "b.hip" {{.*}} "-target-cpu" "gfx803" +// CHECK-SAME: "-fcuda-is-device" "-fgpu-rdc" "-fvisibility" "hidden" +// CHECK-SAME: {{.*}} "-o" [[B_BC:".*bc"]] "-x" "hip" +// CHECK-SAME: {{.*}} [[B_SRC:".*b.hip"]] + +// CHECK: [[LLVM_LINK:"*.llvm-link"]] [[A_BC]] [[B_BC]] +// CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc" +// CHECK-SAME: "-o" [[LINKED_BC_DEV1:".*-gfx803-linked-.*bc"]] + +// CHECK: [[OPT:".*opt"]] [[LINKED_BC_DEV1]] "-mtriple=amdgcn-amd-amdhsa" +// CHECK-SAME: "-mcpu=gfx803" +// CHECK-SAME: "-o" [[OPT_BC_DEV1:".*-gfx803-optimized.*bc"]] + +// CHECK: [[LLC: ".*llc"]] [[OPT_BC_DEV1]] "-mtriple=amdgcn-amd-amdhsa" +// CHECK-SAME: "-filetype=obj" "-mcpu=gfx803" "-o" [[OBJ_DEV1:".*-gfx803-.*o"]] + +// CHECK: [[LLD: ".*lld"]] "-flavor" "gnu" "--no-undefined" "-shared" +// CHECK-SAME: "-o" "[[IMG_DEV1:.*out]]" [[OBJ_DEV1]] + +// CHECK: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa" +// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc" +// CHECK-SAME: {{.*}} "-main-file-name" "a.cu" {{.*}} "-target-cpu" "gfx900" +// CHECK-SAME: "-fcuda-is-device" {{.*}} "-o" [[A_BC:".*bc"]] "-x" "hip" +// CHECK-SAME: {{.*}} [[A_SRC]] + +// CHECK: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa" +// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc" +// CHECK-SAME: {{.*}} "-main-file-name" "b.hip" {{.*}} "-target-cpu" "gfx900" +// CHECK-SAME: "-fcuda-is-device" {{.*}} "-o" [[B_BC:".*bc"]] "-x" "hip" +// CHECK-SAME: {{.*}} [[B_SRC]] + +// CHECK: [[LLVM_LINK]] [[A_BC]] [[B_BC]] +// CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc" +// CHECK-SAME: "-o" [[LINKED_BC_DEV2:".*-gfx900-linked-.*bc"]] + +// CHECK: [[OPT]] [[LINKED_BC_DEV2]] "-mtriple=amdgcn-amd-amdhsa" +// CHECK-SAME: "-mcpu=gfx900" +// CHECK-SAME: "-o" [[OPT_BC_DEV2:".*-gfx900-optimized.*bc"]] + +// CHECK: [[LLC]] [[OPT_BC_DEV2]] "-mtriple=amdgcn-amd-amdhsa" +// CHECK-SAME: "-filetype=obj" "-mcpu=gfx900" "-o" [[OBJ_DEV2:".*-gfx900-.*o"]] + +// CHECK: [[LLD]] "-flavor" "gnu" "--no-undefined" "-shared" +// CHECK-SAME: "-o" "[[IMG_DEV2:.*out]]" [[OBJ_DEV2]] + +// CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu" +// CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa" "-emit-obj" +// CHECK-SAME: {{.*}} "-main-file-name" "a.cu" +// CHECK-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip" +// CHECK-SAME: {{.*}} [[A_SRC]] + +// CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu" +// CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa" "-emit-obj" +// CHECK-SAME: {{.*}} "-main-file-name" "b.hip" +// CHECK-SAME: {{.*}} "-o" [[B_OBJ_HOST:".*o"]] "-x" "hip" +// CHECK-SAME: {{.*}} [[B_SRC]] + +// CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o" +// CHECK-SAME: "-targets={{.*}},hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" +// CHECK-SAME: "-inputs={{.*}},[[IMG_DEV1]],[[IMG_DEV2]]" "-outputs=[[BUNDLE:.*hipfb]]" + +// CHECK: [[LD:".*ld.*"]] {{.*}} [[A_OBJ_HOST]] [[B_OBJ_HOST]] +// CHECK-SAME: {{.*}} "-T" "{{.*}}.lk" Index: test/Driver/hip-toolchain.hip =================================================================== --- test/Driver/hip-toolchain.hip +++ /dev/null @@ -1,86 +0,0 @@ -// REQUIRES: clang-driver -// REQUIRES: x86-registered-target -// REQUIRES: amdgpu-registered-target - -// RUN: %clang -### -target x86_64-linux-gnu \ -// RUN: -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \ -// RUN: --hip-device-lib=lib1.bc --hip-device-lib=lib2.bc \ -// RUN: --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \ -// RUN: --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib2 \ -// RUN: -fuse-ld=lld \ -// RUN: %S/Inputs/hip_multiple_inputs/a.cu \ -// RUN: %S/Inputs/hip_multiple_inputs/b.hip \ -// RUN: 2>&1 | FileCheck %s - -// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa" -// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc" -// CHECK-SAME: {{.*}} "-main-file-name" "a.cu" {{.*}} "-target-cpu" "gfx803" -// CHECK-SAME: "-fcuda-is-device" "-fvisibility" "hidden" -// CHECK-SAME: {{.*}} "-o" [[A_BC:".*bc"]] "-x" "hip" -// CHECK-SAME: {{.*}} [[A_SRC:".*a.cu"]] - -// CHECK: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa" -// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc" -// CHECK-SAME: {{.*}} "-main-file-name" "b.hip" {{.*}} "-target-cpu" "gfx803" -// CHECK-SAME: "-fcuda-is-device" "-fvisibility" "hidden" -// CHECK-SAME: {{.*}} "-o" [[B_BC:".*bc"]] "-x" "hip" -// CHECK-SAME: {{.*}} [[B_SRC:".*b.hip"]] - -// CHECK: [[LLVM_LINK:"*.llvm-link"]] [[A_BC]] [[B_BC]] -// CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc" -// CHECK-SAME: "-o" [[LINKED_BC_DEV1:".*-gfx803-linked-.*bc"]] - -// CHECK: [[OPT:".*opt"]] [[LINKED_BC_DEV1]] "-mtriple=amdgcn-amd-amdhsa" -// CHECK-SAME: "-mcpu=gfx803" -// CHECK-SAME: "-o" [[OPT_BC_DEV1:".*-gfx803-optimized.*bc"]] - -// CHECK: [[LLC: ".*llc"]] [[OPT_BC_DEV1]] "-mtriple=amdgcn-amd-amdhsa" -// CHECK-SAME: "-filetype=obj" "-mcpu=gfx803" "-o" [[OBJ_DEV1:".*-gfx803-.*o"]] - -// CHECK: [[LLD: ".*lld"]] "-flavor" "gnu" "--no-undefined" "-shared" -// CHECK-SAME: "-o" "[[IMG_DEV1:.*out]]" [[OBJ_DEV1]] - -// CHECK: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa" -// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc" -// CHECK-SAME: {{.*}} "-main-file-name" "a.cu" {{.*}} "-target-cpu" "gfx900" -// CHECK-SAME: "-fcuda-is-device" {{.*}} "-o" [[A_BC:".*bc"]] "-x" "hip" -// CHECK-SAME: {{.*}} [[A_SRC]] - -// CHECK: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa" -// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc" -// CHECK-SAME: {{.*}} "-main-file-name" "b.hip" {{.*}} "-target-cpu" "gfx900" -// CHECK-SAME: "-fcuda-is-device" {{.*}} "-o" [[B_BC:".*bc"]] "-x" "hip" -// CHECK-SAME: {{.*}} [[B_SRC]] - -// CHECK: [[LLVM_LINK]] [[A_BC]] [[B_BC]] -// CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc" -// CHECK-SAME: "-o" [[LINKED_BC_DEV2:".*-gfx900-linked-.*bc"]] - -// CHECK: [[OPT]] [[LINKED_BC_DEV2]] "-mtriple=amdgcn-amd-amdhsa" -// CHECK-SAME: "-mcpu=gfx900" -// CHECK-SAME: "-o" [[OPT_BC_DEV2:".*-gfx900-optimized.*bc"]] - -// CHECK: [[LLC]] [[OPT_BC_DEV2]] "-mtriple=amdgcn-amd-amdhsa" -// CHECK-SAME: "-filetype=obj" "-mcpu=gfx900" "-o" [[OBJ_DEV2:".*-gfx900-.*o"]] - -// CHECK: [[LLD]] "-flavor" "gnu" "--no-undefined" "-shared" -// CHECK-SAME: "-o" "[[IMG_DEV2:.*out]]" [[OBJ_DEV2]] - -// CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu" -// CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa" "-emit-obj" -// CHECK-SAME: {{.*}} "-main-file-name" "a.cu" -// CHECK-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip" -// CHECK-SAME: {{.*}} [[A_SRC]] - -// CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu" -// CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa" "-emit-obj" -// CHECK-SAME: {{.*}} "-main-file-name" "b.hip" -// CHECK-SAME: {{.*}} "-o" [[B_OBJ_HOST:".*o"]] "-x" "hip" -// CHECK-SAME: {{.*}} [[B_SRC]] - -// CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o" -// CHECK-SAME: "-targets={{.*}},hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" -// CHECK-SAME: "-inputs={{.*}},[[IMG_DEV1]],[[IMG_DEV2]]" "-outputs=[[BUNDLE:.*o]]" - -// CHECK: [[LD:".*ld.*"]] {{.*}} [[A_OBJ_HOST]] [[B_OBJ_HOST]] -// CHECK-SAME: {{.*}} "-T" "{{.*}}.lk" Index: test/SemaCUDA/extern-shared.cu =================================================================== --- test/SemaCUDA/extern-shared.cu +++ test/SemaCUDA/extern-shared.cu @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -fsyntax-only -Wundefined-internal -verify %s // RUN: %clang_cc1 -fsyntax-only -Wundefined-internal -fcuda-is-device -verify %s -// RUN: %clang_cc1 -fsyntax-only -Wundefined-internal -fcuda-rdc -verify=rdc %s -// RUN: %clang_cc1 -fsyntax-only -Wundefined-internal -fcuda-is-device -fcuda-rdc -verify=rdc %s +// RUN: %clang_cc1 -fsyntax-only -Wundefined-internal -fgpu-rdc -verify=rdc %s +// RUN: %clang_cc1 -fsyntax-only -Wundefined-internal -fcuda-is-device -fgpu-rdc -verify=rdc %s // Most of these declarations are fine in separate compilation mode.