Index: include/clang/Basic/LangOptions.def
===================================================================
--- include/clang/Basic/LangOptions.def
+++ include/clang/Basic/LangOptions.def
@@ -211,7 +211,7 @@
 LANGOPT(CUDAAllowVariadicFunctions, 1, 0, "allowing variadic functions in CUDA device code")
 LANGOPT(CUDAHostDeviceConstexpr, 1, 1, "treating unattributed constexpr functions as __host__ __device__")
 LANGOPT(CUDADeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions")
-LANGOPT(CUDARelocatableDeviceCode, 1, 0, "generate relocatable device code")
+LANGOPT(GPURelocatableDeviceCode, 1, 0, "generate relocatable device code")
 
 LANGOPT(SizedDeallocation , 1, 0, "sized deallocation")
 LANGOPT(AlignedAllocation , 1, 0, "aligned allocation")
Index: include/clang/Driver/Options.td
===================================================================
--- include/clang/Driver/Options.td
+++ include/clang/Driver/Options.td
@@ -584,9 +584,9 @@
 def fcuda_approx_transcendentals : Flag<["-"], "fcuda-approx-transcendentals">,
   Flags<[CC1Option]>, HelpText<"Use approximate transcendental functions">;
 def fno_cuda_approx_transcendentals : Flag<["-"], "fno-cuda-approx-transcendentals">;
-def fcuda_rdc : Flag<["-"], "fcuda-rdc">, Flags<[CC1Option]>,
+def fgpu_rdc : Flag<["-"], "fgpu-rdc">, Flags<[CC1Option]>,
   HelpText<"Generate relocatable device code, also known as separate compilation mode.">;
-def fno_cuda_rdc : Flag<["-"], "fno-cuda-rdc">;
+def fno_gpu_rdc : Flag<["-"], "fno-gpu-rdc">;
 def fcuda_short_ptr : Flag<["-"], "fcuda-short-ptr">, Flags<[CC1Option]>,
   HelpText<"Use 32-bit pointers for accessing const/local/shared address spaces.">;
 def fno_cuda_short_ptr : Flag<["-"], "fno-cuda-short-ptr">;
Index: include/clang/Driver/Types.def
===================================================================
--- include/clang/Driver/Types.def
+++ include/clang/Driver/Types.def
@@ -101,4 +101,5 @@
 TYPE("dSYM",                     dSYM,         INVALID,         "dSYM",  "A")
 TYPE("dependencies",             Dependencies, INVALID,         "d",     "")
 TYPE("cuda-fatbin",              CUDA_FATBIN,  INVALID,         "fatbin","A")
+TYPE("hip-fatbin",               HIP_FATBIN,   INVALID,         "hipfb", "A")
 TYPE("none",                     Nothing,      INVALID,         nullptr, "u")
Index: lib/AST/Decl.cpp
===================================================================
--- lib/AST/Decl.cpp
+++ lib/AST/Decl.cpp
@@ -2451,7 +2451,7 @@
   //
   // With CUDA relocatable device code enabled, these variables don't get
   // special handling; they're treated like regular extern variables.
-  if (LangOpts.CUDA && !LangOpts.CUDARelocatableDeviceCode &&
+  if (LangOpts.CUDA && !LangOpts.GPURelocatableDeviceCode &&
       hasExternalStorage() && hasAttr<CUDASharedAttr>() &&
       isa<IncompleteArrayType>(getType()))
     return true;
Index: lib/CodeGen/CGCUDANV.cpp
===================================================================
--- lib/CodeGen/CGCUDANV.cpp
+++ lib/CodeGen/CGCUDANV.cpp
@@ -137,7 +137,7 @@
 CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
     : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
       TheModule(CGM.getModule()),
-      RelocatableDeviceCode(CGM.getLangOpts().CUDARelocatableDeviceCode) {
+      RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode) {
   CodeGen::CodeGenTypes &Types = CGM.getTypes();
   ASTContext &Ctx = CGM.getContext();
 
@@ -353,8 +353,8 @@
   // global variable and save a reference in GpuBinaryHandle to be cleaned up
   // in destructor on exit. Then associate all known kernels with the GPU binary
   // handle so CUDA runtime can figure out what to call on the GPU side.
-  std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary;
-  if (!IsHIP) {
+  std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
+  if (!CudaGpuBinaryFileName.empty()) {
     llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr =
         llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName);
     if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
@@ -388,15 +388,23 @@
     ModuleIDSectionName = "__hip_module_id";
     ModuleIDPrefix = "__hip_";
 
-    // For HIP, create an external symbol __hip_fatbin in section .hip_fatbin.
-    // The external symbol is supposed to contain the fat binary but will be
-    // populated somewhere else, e.g. by lld through link script.
-    FatBinStr = new llvm::GlobalVariable(
+    if (CudaGpuBinary) {
+      // If fatbin is available from early finalization, create a string
+      // literal containing the fat binary loaded from the given file.
+      FatBinStr = makeConstantString(CudaGpuBinary->getBuffer(), "",
+                                     FatbinConstantName, 8);
+    } else {
+      // If fatbin is not available, create an external symbol
+      // __hip_fatbin in section .hip_fatbin. The external symbol is supposed
+      // to contain the fat binary but will be populated somewhere else,
+      // e.g. by lld through link script.
+      FatBinStr = new llvm::GlobalVariable(
         CGM.getModule(), CGM.Int8Ty,
         /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
         "__hip_fatbin", nullptr,
         llvm::GlobalVariable::NotThreadLocal);
-    cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
+      cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
+    }
 
     FatMagic = HIPFatMagic;
   } else {
@@ -447,6 +455,8 @@
   // thread safety of the loaded program. Therefore we can assume sequential
   // execution of constructor functions here.
   if (IsHIP) {
+    auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage :
+        llvm::GlobalValue::LinkOnceAnyLinkage;
     llvm::BasicBlock *IfBlock =
         llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
     llvm::BasicBlock *ExitBlock =
@@ -455,12 +465,13 @@
     // of HIP ABI.
     GpuBinaryHandle = new llvm::GlobalVariable(
         TheModule, VoidPtrPtrTy, /*isConstant=*/false,
-        llvm::GlobalValue::LinkOnceAnyLinkage,
+        Linkage,
         /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy),
         "__hip_gpubin_handle");
     GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getQuantity());
     // Prevent the weak symbol in different shared libraries being merged.
-    GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
+    if (Linkage != llvm::GlobalValue::InternalLinkage)
+      GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
     Address GpuBinaryAddr(
         GpuBinaryHandle,
         CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
Index: lib/Driver/Driver.cpp
===================================================================
--- lib/Driver/Driver.cpp
+++ lib/Driver/Driver.cpp
@@ -2486,11 +2486,13 @@
   class HIPActionBuilder final : public CudaActionBuilderBase {
     /// The linker inputs obtained for each device arch.
     SmallVector<ActionList, 8> DeviceLinkerInputs;
+    bool Relocatable;
 
   public:
     HIPActionBuilder(Compilation &C, DerivedArgList &Args,
                      const Driver::InputList &Inputs)
-        : CudaActionBuilderBase(C, Args, Inputs, Action::OFK_HIP) {}
+        : CudaActionBuilderBase(C, Args, Inputs, Action::OFK_HIP),
+          Relocatable(false) {}
 
     bool canUseBundlerUnbundler() const override { return true; }
 
@@ -2499,23 +2501,68 @@
                          phases::ID CurPhase, phases::ID FinalPhase,
                          PhasesTy &Phases) override {
       // amdgcn does not support linking of object files, therefore we skip
-      // backend and assemble phases to output LLVM IR.
-      if (CudaDeviceActions.empty() || CurPhase == phases::Backend ||
+      // backend and assemble phases to output LLVM IR. Except for generating
+      // non-relocatable device coee, where we generate fat binary for device
+      // code and pass to host in Backend phase.
+      if (CudaDeviceActions.empty() ||
+          (CurPhase == phases::Backend && Relocatable) ||
           CurPhase == phases::Assemble)
         return ABRT_Success;
 
-      assert((CurPhase == phases::Link ||
+      assert(((CurPhase == phases::Link && Relocatable) ||
               CudaDeviceActions.size() == GpuArchList.size()) &&
              "Expecting one action per GPU architecture.");
       assert(!CompileHostOnly &&
              "Not expecting CUDA actions in host-only compilation.");
 
-      // Save CudaDeviceActions to DeviceLinkerInputs for each GPU subarch.
-      // This happens to each device action originated from each input file.
-      // Later on, device actions in DeviceLinkerInputs are used to create
-      // device link actions in appendLinkDependences and the created device
-      // link actions are passed to the offload action as device dependence.
-      if (CurPhase == phases::Link) {
+      if (!Relocatable && CurPhase == phases::Backend) {
+        // If we are in backend phase, we attempt to generate the fat binary.
+        // We compile each arch to IR and use a link action to generate code
+        // object containing ISA. Then we use a special "link" action to create
+        // a fat binary containing all the code objects for different GPU's.
+        // The fat binary is then an input to the host action.
+        for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
+          // Create a link action to link device IR with device library
+          // and generate ISA.
+          ActionList AL;
+          AL.push_back(CudaDeviceActions[I]);
+          CudaDeviceActions[I] =
+              C.MakeAction<LinkJobAction>(AL, types::TY_Image);
+
+          // OffloadingActionBuilder propagates device arch until an offload
+          // action. Since the next action for creating fatbin does
+          // not have device arch, whereas the above link action and its input
+          // have device arch, an offload action is needed to stop the null
+          // device arch of the next action being propagated to the above link
+          // action.
+          OffloadAction::DeviceDependences DDep;
+          DDep.add(*CudaDeviceActions[I], *ToolChains.front(),
+                   CudaArchToString(GpuArchList[I]), AssociatedOffloadKind);
+          CudaDeviceActions[I] = C.MakeAction<OffloadAction>(
+              DDep, CudaDeviceActions[I]->getType());
+        }
+        // Create HIP fat binary with a special "link" action.
+        CudaFatBinary =
+            C.MakeAction<LinkJobAction>(CudaDeviceActions,
+                types::TY_HIP_FATBIN);
+
+        DA.add(*CudaFatBinary, *ToolChains.front(), /*BoundArch=*/nullptr,
+               AssociatedOffloadKind);
+        // Clear the fat binary, it is already a dependence to an host
+        // action.
+        CudaFatBinary = nullptr;
+
+        // Remove the CUDA actions as they are already connected to an host
+        // action or fat binary.
+        CudaDeviceActions.clear();
+
+        return ABRT_Success;
+      } else if (CurPhase == phases::Link) {
+        // Save CudaDeviceActions to DeviceLinkerInputs for each GPU subarch.
+        // This happens to each device action originated from each input file.
+        // Later on, device actions in DeviceLinkerInputs are used to create
+        // device link actions in appendLinkDependences and the created device
+        // link actions are passed to the offload action as device dependence.
         DeviceLinkerInputs.resize(CudaDeviceActions.size());
         auto LI = DeviceLinkerInputs.begin();
         for (auto *A : CudaDeviceActions) {
@@ -2548,6 +2595,13 @@
         ++I;
       }
     }
+
+    bool initialize() override {
+      Relocatable = Args.hasFlag(options::OPT_fgpu_rdc,
+          options::OPT_fno_gpu_rdc, /*Default=*/false);
+
+      return CudaActionBuilderBase::initialize();
+    }
   };
 
   /// OpenMP action builder. The host bitcode is passed to the device frontend
Index: lib/Driver/ToolChains/Clang.cpp
===================================================================
--- lib/Driver/ToolChains/Clang.cpp
+++ lib/Driver/ToolChains/Clang.cpp
@@ -4920,16 +4920,16 @@
     CmdArgs.push_back(Args.MakeArgString(Flags));
   }
 
-  if (IsCuda) {
-    // Host-side cuda compilation receives all device-side outputs in a single
-    // fatbin as Inputs[1]. Include the binary with -fcuda-include-gpubinary.
-    if (CudaDeviceInput) {
+  // Host-side cuda compilation receives all device-side outputs in a single
+  // fatbin as Inputs[1]. Include the binary with -fcuda-include-gpubinary.
+  if ((IsCuda || IsHIP) && CudaDeviceInput) {
       CmdArgs.push_back("-fcuda-include-gpubinary");
       CmdArgs.push_back(CudaDeviceInput->getFilename());
-    }
+      if (Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false))
+        CmdArgs.push_back("-fgpu-rdc");
+  }
 
-    if (Args.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc, false))
-      CmdArgs.push_back("-fcuda-rdc");
+  if (IsCuda) {
     if (Args.hasFlag(options::OPT_fcuda_short_ptr,
                      options::OPT_fno_cuda_short_ptr, false))
       CmdArgs.push_back("-fcuda-short-ptr");
Index: lib/Driver/ToolChains/CommonArgs.cpp
===================================================================
--- lib/Driver/ToolChains/CommonArgs.cpp
+++ lib/Driver/ToolChains/CommonArgs.cpp
@@ -15,6 +15,7 @@
 #include "Arch/SystemZ.h"
 #include "Arch/X86.h"
 #include "Hexagon.h"
+#include "HIP.h"
 #include "InputInfo.h"
 #include "clang/Basic/CharInfo.h"
 #include "clang/Basic/LangOptions.h"
@@ -1337,6 +1338,18 @@
   if (!JA.isHostOffloading(Action::OFK_HIP))
     return;
 
+  InputInfoList DeviceInputs;
+  for (const auto &II : Inputs) {
+    const Action *A = II.getAction();
+    // Is this a device linking action?
+    if (A && isa<LinkJobAction>(A) && A->isDeviceOffloading(Action::OFK_HIP)) {
+      DeviceInputs.push_back(II);
+    }
+  }
+
+  if (DeviceInputs.empty())
+    return;
+
   // Create temporary linker script. Keep it if save-temps is enabled.
   const char *LKS;
   SmallString<256> Name = llvm::sys::path::filename(Output.getFilename());
@@ -1364,39 +1377,12 @@
          "Wrong platform");
   (void)HIPTC;
 
-  // Construct clang-offload-bundler command to bundle object files for
-  // for different GPU archs.
-  ArgStringList BundlerArgs;
-  BundlerArgs.push_back(Args.MakeArgString("-type=o"));
-
-  // ToDo: Remove the dummy host binary entry which is required by
-  // clang-offload-bundler.
-  std::string BundlerTargetArg = "-targets=host-x86_64-unknown-linux";
-  std::string BundlerInputArg = "-inputs=/dev/null";
-
-  for (const auto &II : Inputs) {
-    const Action *A = II.getAction();
-    // Is this a device linking action?
-    if (A && isa<LinkJobAction>(A) && A->isDeviceOffloading(Action::OFK_HIP)) {
-      BundlerTargetArg = BundlerTargetArg + ",hip-amdgcn-amd-amdhsa-" +
-                         StringRef(A->getOffloadingArch()).str();
-      BundlerInputArg = BundlerInputArg + "," + II.getFilename();
-    }
-  }
-  BundlerArgs.push_back(Args.MakeArgString(BundlerTargetArg));
-  BundlerArgs.push_back(Args.MakeArgString(BundlerInputArg));
-
-  std::string BundleFileName = C.getDriver().GetTemporaryPath("BUNDLE", "o");
+  // The output file name needs to persist through the compilation, therefore
+  // it needs to be created through MakeArgString.
+  std::string BundleFileName = C.getDriver().GetTemporaryPath("BUNDLE", "hipfb");
   const char *BundleFile =
       C.addTempFile(C.getArgs().MakeArgString(BundleFileName.c_str()));
-  auto BundlerOutputArg =
-      Args.MakeArgString(std::string("-outputs=").append(BundleFile));
-  BundlerArgs.push_back(BundlerOutputArg);
-
-  SmallString<128> BundlerPath(C.getDriver().Dir);
-  llvm::sys::path::append(BundlerPath, "clang-offload-bundler");
-  const char *Bundler = Args.MakeArgString(BundlerPath);
-  C.addCommand(llvm::make_unique<Command>(JA, T, Bundler, BundlerArgs, Inputs));
+  AMDGCN::constructHIPFatbinCommand(C, JA, BundleFile, DeviceInputs, Args, T);
 
   // Add commands to embed target binaries. We ensure that each section and
   // image is 16-byte aligned. This is not mandatory, but increases the
Index: lib/Driver/ToolChains/Cuda.cpp
===================================================================
--- lib/Driver/ToolChains/Cuda.cpp
+++ lib/Driver/ToolChains/Cuda.cpp
@@ -398,8 +398,8 @@
                                options::OPT_fnoopenmp_relocatable_target,
                                /*Default=*/true);
   else if (JA.isOffloading(Action::OFK_Cuda))
-    Relocatable = Args.hasFlag(options::OPT_fcuda_rdc,
-                               options::OPT_fno_cuda_rdc, /*Default=*/false);
+    Relocatable = Args.hasFlag(options::OPT_fgpu_rdc,
+                               options::OPT_fno_gpu_rdc, /*Default=*/false);
 
   if (Relocatable)
     CmdArgs.push_back("-c");
@@ -609,9 +609,9 @@
                            options::OPT_fno_cuda_approx_transcendentals, false))
       CC1Args.push_back("-fcuda-approx-transcendentals");
 
-    if (DriverArgs.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc,
+    if (DriverArgs.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
                            false))
-      CC1Args.push_back("-fcuda-rdc");
+      CC1Args.push_back("-fgpu-rdc");
   }
 
   if (DriverArgs.hasArg(options::OPT_nocudalib))
Index: lib/Driver/ToolChains/HIP.h
===================================================================
--- lib/Driver/ToolChains/HIP.h
+++ lib/Driver/ToolChains/HIP.h
@@ -19,6 +19,11 @@
 namespace tools {
 
 namespace AMDGCN {
+  // Construct command for creating HIP fatbin.
+  void constructHIPFatbinCommand(Compilation &C, const JobAction &JA,
+                  StringRef OutputFileName, const InputInfoList &Inputs,
+                  const llvm::opt::ArgList &TCArgs, const Tool& T);
+
 // Runs llvm-link/opt/llc/lld, which links multiple LLVM bitcode, together with
 // device library, then compiles it to ISA in a shared object.
 class LLVM_LIBRARY_VISIBILITY Linker : public Tool {
Index: lib/Driver/ToolChains/HIP.cpp
===================================================================
--- lib/Driver/ToolChains/HIP.cpp
+++ lib/Driver/ToolChains/HIP.cpp
@@ -184,6 +184,40 @@
   C.addCommand(llvm::make_unique<Command>(JA, *this, Lld, LldArgs, Inputs));
 }
 
+// Construct a clang-offload-bundler command to bundle code objects for
+// different GPU's into a HIP fat binary.
+void AMDGCN::constructHIPFatbinCommand(Compilation &C, const JobAction &JA,
+                  StringRef OutputFileName, const InputInfoList &Inputs,
+                  const llvm::opt::ArgList &Args, const Tool& T) {
+  // Construct clang-offload-bundler command to bundle object files for
+  // for different GPU archs.
+  ArgStringList BundlerArgs;
+  BundlerArgs.push_back(Args.MakeArgString("-type=o"));
+
+  // ToDo: Remove the dummy host binary entry which is required by
+  // clang-offload-bundler.
+  std::string BundlerTargetArg = "-targets=host-x86_64-unknown-linux";
+  std::string BundlerInputArg = "-inputs=/dev/null";
+
+  for (const auto &II : Inputs) {
+    const auto* A = II.getAction();
+    BundlerTargetArg = BundlerTargetArg + ",hip-amdgcn-amd-amdhsa-" +
+                       StringRef(A->getOffloadingArch()).str();
+    BundlerInputArg = BundlerInputArg + "," + II.getFilename();
+  }
+  BundlerArgs.push_back(Args.MakeArgString(BundlerTargetArg));
+  BundlerArgs.push_back(Args.MakeArgString(BundlerInputArg));
+
+  auto BundlerOutputArg =
+      Args.MakeArgString(std::string("-outputs=").append(OutputFileName));
+  BundlerArgs.push_back(BundlerOutputArg);
+
+  SmallString<128> BundlerPath(C.getDriver().Dir);
+  llvm::sys::path::append(BundlerPath, "clang-offload-bundler");
+  const char *Bundler = Args.MakeArgString(BundlerPath);
+  C.addCommand(llvm::make_unique<Command>(JA, T, Bundler, BundlerArgs, Inputs));
+}
+
 // For amdgcn the inputs of the linker job are device bitcode and output is
 // object file. It calls llvm-link, opt, llc, then lld steps.
 void AMDGCN::Linker::ConstructJob(Compilation &C, const JobAction &JA,
@@ -192,6 +226,9 @@
                                    const ArgList &Args,
                                    const char *LinkingOutput) const {
 
+  if (JA.getType() == types::TY_HIP_FATBIN)
+    return constructHIPFatbinCommand(C, JA, Output.getFilename(), Inputs, Args, *this);
+
   assert(getToolChain().getTriple().getArch() == llvm::Triple::amdgcn &&
          "Unsupported target");
 
@@ -244,9 +281,9 @@
                          options::OPT_fno_cuda_approx_transcendentals, false))
     CC1Args.push_back("-fcuda-approx-transcendentals");
 
-  if (DriverArgs.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc,
+  if (DriverArgs.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
                          false))
-    CC1Args.push_back("-fcuda-rdc");
+    CC1Args.push_back("-fgpu-rdc");
 
   // Default to "hidden" visibility, as object level linking will not be
   // supported for the foreseeable future.
Index: lib/Frontend/CompilerInvocation.cpp
===================================================================
--- lib/Frontend/CompilerInvocation.cpp
+++ lib/Frontend/CompilerInvocation.cpp
@@ -2220,7 +2220,7 @@
   if (Opts.CUDAIsDevice && Args.hasArg(OPT_fcuda_approx_transcendentals))
     Opts.CUDADeviceApproxTranscendentals = 1;
 
-  Opts.CUDARelocatableDeviceCode = Args.hasArg(OPT_fcuda_rdc);
+  Opts.GPURelocatableDeviceCode = Args.hasArg(OPT_fgpu_rdc);
 
   if (Opts.ObjC1) {
     if (Arg *arg = Args.getLastArg(OPT_fobjc_runtime_EQ)) {
Index: lib/Sema/SemaDeclAttr.cpp
===================================================================
--- lib/Sema/SemaDeclAttr.cpp
+++ lib/Sema/SemaDeclAttr.cpp
@@ -4143,7 +4143,7 @@
   const auto *VD = cast<VarDecl>(D);
   // extern __shared__ is only allowed on arrays with no length (e.g.
   // "int x[]").
-  if (!S.getLangOpts().CUDARelocatableDeviceCode && VD->hasExternalStorage() &&
+  if (!S.getLangOpts().GPURelocatableDeviceCode && VD->hasExternalStorage() &&
       !isa<IncompleteArrayType>(VD->getType())) {
     S.Diag(AL.getLoc(), diag::err_cuda_extern_shared) << VD;
     return;
Index: test/CodeGenCUDA/device-stub.cu
===================================================================
--- test/CodeGenCUDA/device-stub.cu
+++ test/CodeGenCUDA/device-stub.cu
@@ -6,22 +6,22 @@
 // RUN:     -fcuda-include-gpubinary %t -o -  -DNOGLOBALS \
 // RUN:   | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=NOGLOBALS,CUDANOGLOBALS
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
-// RUN:     -fcuda-rdc -fcuda-include-gpubinary %t -o - \
+// RUN:     -fgpu-rdc -fcuda-include-gpubinary %t -o - \
 // RUN:   | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,RDC,CUDA,CUDARDC
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - \
 // RUN:   | FileCheck -allow-deprecated-dag-overlap %s -check-prefix=NOGPUBIN
 
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
 // RUN:     -fcuda-include-gpubinary %t -o - -x hip\
-// RUN:   | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,NORDC,HIP
+// RUN:   | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,NORDC,HIP,HIPEF
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
 // RUN:     -fcuda-include-gpubinary %t -o -  -DNOGLOBALS -x hip \
 // RUN:   | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=NOGLOBALS,HIPNOGLOBALS
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
-// RUN:     -fcuda-rdc -fcuda-include-gpubinary %t -o - -x hip \
-// RUN:   | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,NORDC,HIP
+// RUN:     -fgpu-rdc -fcuda-include-gpubinary %t -o - -x hip \
+// RUN:   | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,NORDC,HIP,HIPEF
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - -x hip\
-// RUN:   | FileCheck -allow-deprecated-dag-overlap %s -check-prefix=NOGPUBIN
+// RUN:   | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=ALL,NORDC,HIP,HIPNEF
 
 #include "Inputs/cuda.h"
 
@@ -64,8 +64,9 @@
 // * constant unnamed string with the kernel name
 // ALL: private unnamed_addr constant{{.*}}kernelfunc{{.*}}\00"
 // * constant unnamed string with GPU binary
-// HIP: @[[FATBIN:__hip_fatbin]] = external constant i8, section ".hip_fatbin"
 // CUDA: @[[FATBIN:.*]] = private constant{{.*GPU binary would be here.*}}\00",
+// HIPEF: @[[FATBIN:.*]] = private constant{{.*GPU binary would be here.*}}\00",
+// HIPNEF: @[[FATBIN:__hip_fatbin]] = external constant i8, section ".hip_fatbin"
 // CUDANORDC-SAME: section ".nv_fatbin", align 8
 // CUDARDC-SAME: section "__nv_relfatbin", align 8
 // * constant struct that wraps GPU binary
@@ -74,13 +75,14 @@
 // CUDA-SAME: { i32 1180844977, i32 1,
 // HIP-SAME: { i32 1212764230, i32 1,
 // CUDA-SAME: i8* getelementptr inbounds ({{.*}}@[[FATBIN]], i64 0, i64 0),
-// HIP-SAME:  i8* @[[FATBIN]],
+// HIPEF-SAME: i8* getelementptr inbounds ({{.*}}@[[FATBIN]], i64 0, i64 0),
+// HIPNEF-SAME:  i8* @[[FATBIN]],
 // ALL-SAME: i8* null }
 // CUDA-SAME: section ".nvFatBinSegment"
 // HIP-SAME: section ".hipFatBinSegment"
 // * variable to save GPU binary handle after initialization
 // CUDANORDC: @__[[PREFIX]]_gpubin_handle = internal global i8** null
-// HIP: @__[[PREFIX]]_gpubin_handle = linkonce hidden global i8** null
+// HIPNEF: @__[[PREFIX]]_gpubin_handle = linkonce hidden global i8** null
 // * constant unnamed string with NVModuleID
 // RDC: [[MODULE_ID_GLOBAL:@.*]] = private constant
 // CUDARDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32
@@ -157,7 +159,7 @@
 // device-side globals, but we still need to register GPU binary.
 // Skip GPU binary string first.
 // CUDANOGLOBALS: @{{.*}} = private constant{{.*}}
-// HIPNOGLOBALS: @{{.*}} = external constant{{.*}}
+// HIPNOGLOBALS: @{{.*}} = internal constant{{.*}}
 // NOGLOBALS-NOT: define internal void @__{{.*}}_register_globals
 // NOGLOBALS: define internal void @__[[PREFIX:cuda|hip]]_module_ctor
 // NOGLOBALS: call{{.*}}[[PREFIX]]RegisterFatBinary{{.*}}__[[PREFIX]]_fatbin_wrapper
Index: test/Driver/cuda-external-tools.cu
===================================================================
--- test/Driver/cuda-external-tools.cu
+++ test/Driver/cuda-external-tools.cu
@@ -19,7 +19,7 @@
 // RUN: %clang -### -target x86_64-linux-gnu -Ofast -c %s 2>&1 \
 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,OPT3 %s
 // Generating relocatable device code
-// RUN: %clang -### -target x86_64-linux-gnu -fcuda-rdc -c %s 2>&1 \
+// RUN: %clang -### -target x86_64-linux-gnu -fgpu-rdc -c %s 2>&1 \
 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,RDC %s
 
 // With debugging enabled, ptxas should be run with with no ptxas optimizations.
@@ -46,21 +46,21 @@
 // RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -c %s 2>&1 \
 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35 %s
 // Separate compilation targeting sm_35.
-// RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -fcuda-rdc -c %s 2>&1 \
+// RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -fgpu-rdc -c %s 2>&1 \
 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,RDC %s
 
 // 32-bit compile.
 // RUN: %clang -### -target i386-linux-gnu -c %s 2>&1 \
 // RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20 %s
 // 32-bit compile when generating relocatable device code.
-// RUN: %clang -### -target i386-linux-gnu -fcuda-rdc -c %s 2>&1 \
+// RUN: %clang -### -target i386-linux-gnu -fgpu-rdc -c %s 2>&1 \
 // RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20,RDC %s
 
 // Compile with -fintegrated-as.  This should still cause us to invoke ptxas.
 // RUN: %clang -### -target x86_64-linux-gnu -fintegrated-as -c %s 2>&1 \
 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,OPT0 %s
 // Check that we still pass -c when generating relocatable device code.
-// RUN: %clang -### -target x86_64-linux-gnu -fintegrated-as -fcuda-rdc -c %s 2>&1 \
+// RUN: %clang -### -target x86_64-linux-gnu -fintegrated-as -fgpu-rdc -c %s 2>&1 \
 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,RDC %s
 
 // Check -Xcuda-ptxas and -Xcuda-fatbinary
@@ -77,11 +77,11 @@
 // RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20 %s
 
 // Check relocatable device code generation on MacOS.
-// RUN: %clang -### -target x86_64-apple-macosx -O0 -fcuda-rdc -c %s 2>&1 \
+// RUN: %clang -### -target x86_64-apple-macosx -O0 -fgpu-rdc -c %s 2>&1 \
 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,RDC %s
-// RUN: %clang -### -target x86_64-apple-macosx --cuda-gpu-arch=sm_35 -fcuda-rdc -c %s 2>&1 \
+// RUN: %clang -### -target x86_64-apple-macosx --cuda-gpu-arch=sm_35 -fgpu-rdc -c %s 2>&1 \
 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,RDC %s
-// RUN: %clang -### -target i386-apple-macosx -fcuda-rdc -c %s 2>&1 \
+// RUN: %clang -### -target i386-apple-macosx -fgpu-rdc -c %s 2>&1 \
 // RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20,RDC %s
 
 // Check that CLANG forwards the -v flag to PTXAS.
@@ -92,12 +92,12 @@
 // CHECK: "-cc1"
 // ARCH64-SAME: "-triple" "nvptx64-nvidia-cuda"
 // ARCH32-SAME: "-triple" "nvptx-nvidia-cuda"
+// RDC-SAME: "-fgpu-rdc"
+// CHECK-NOT: "-fgpu-rdc"
 // SM20-SAME: "-target-cpu" "sm_20"
 // SM35-SAME: "-target-cpu" "sm_35"
 // SM20-SAME: "-o" "[[PTXFILE:[^"]*]]"
 // SM35-SAME: "-o" "[[PTXFILE:[^"]*]]"
-// RDC-SAME: "-fcuda-rdc"
-// CHECK-NOT: "-fcuda-rdc"
 
 // Match the call to ptxas (which assembles PTX to SASS).
 // CHECK: ptxas
@@ -141,7 +141,7 @@
 // ARCH64-SAME: "-triple" "x86_64-
 // ARCH32-SAME: "-triple" "i386-
 // CHECK-SAME: "-fcuda-include-gpubinary" "[[FATBINARY]]"
-// RDC-SAME: "-fcuda-rdc"
-// CHECK-NOT: "-fcuda-rdc"
+// RDC-SAME: "-fgpu-rdc"
+// CHECK-NOT: "-fgpu-rdc"
 
 // CHK-PTXAS-VERBOSE: ptxas{{.*}}" "-v"
Index: test/Driver/cuda-phases.cu
===================================================================
--- test/Driver/cuda-phases.cu
+++ test/Driver/cuda-phases.cu
@@ -11,12 +11,21 @@
 //
 // Test single gpu architecture with complete compilation.
 //
+// Test CUDA NVPTX phases.
 // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
 // RUN: --cuda-gpu-arch=sm_30 %s 2>&1 \
 // RUN: | FileCheck -check-prefixes=BIN,BIN_NV %s
+//
+// Test HIP AMDGPU -fgpu-rdc phases.
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=gfx803 -fgpu-rdc %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=BIN,BIN_AMD,BIN_AMD_RDC %s
+//
+// Test HIP AMDGPU -fno-gpu-rdc phases (default).
 // RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
 // RUN: --cuda-gpu-arch=gfx803 %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN,BIN_AMD %s
+// RUN: | FileCheck -check-prefixes=BIN,BIN_AMD,BIN_AMD_NRDC %s
+//
 // BIN_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]])
 // BIN_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]])
 // BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
@@ -32,12 +41,17 @@
 // BIN_NV-DAG: [[P10:[0-9]+]]: linker, {[[P8]], [[P9]]}, cuda-fatbin, (device-[[T]])
 // BIN_NV-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-[[T]] ([[TRIPLE]])" {[[P10]]}, ir
 // BIN_NV-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
-// BIN_AMD-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
+// BIN_AMD_RDC-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
+// BIN_AMD_NRDC-DAG: [[P6:[0-9]+]]: linker, {[[P5]]}, image, (device-hip, [[ARCH]])
+// BIN_AMD_NRDC-DAG: [[P7:[0-9]+]]: offload, "device-hip (amdgcn-amd-amdhsa:[[ARCH]])" {[[P6]]}, image
+// BIN_AMD_NRDC-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, hip-fatbin, (device-hip)
+// BIN_AMD_NRDC-DAG: [[P11:[0-9]+]]: offload, "host-hip (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-hip (amdgcn-amd-amdhsa)" {[[P8]]}, ir
+// BIN_AMD_NRDC-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
 // BIN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
 // BIN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]])
-// BIN_AMD-DAG: [[P15:[0-9]+]]: linker, {[[P5]]}, image, (device-[[T]], [[ARCH]])
-// BIN_AMD-DAG: [[P16:[0-9]+]]: offload, "host-[[T]] (powerpc64le-ibm-linux-gnu)" {[[P14]]},
-// BIN_AMD-DAG-SAME:  "device-[[T]] ([[TRIPLE:amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P15]]}, object
+// BIN_AMD_RDC-DAG: [[P15:[0-9]+]]: linker, {[[P5]]}, image, (device-[[T]], [[ARCH]])
+// BIN_AMD_RDC-DAG: [[P16:[0-9]+]]: offload, "host-[[T]] (powerpc64le-ibm-linux-gnu)" {[[P14]]},
+// BIN_AMD_RDC-DAG-SAME:  "device-[[T]] ([[TRIPLE:amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P15]]}, object
 
 //
 // Test single gpu architecture up to the assemble phase.
@@ -46,7 +60,7 @@
 // RUN: --cuda-gpu-arch=sm_30 %s -S 2>&1 \
 // RUN: | FileCheck -check-prefixes=ASM,ASM_NV %s
 // RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
-// RUN: --cuda-gpu-arch=gfx803 %s -S 2>&1 \
+// RUN: --cuda-gpu-arch=gfx803 -fgpu-rdc %s -S 2>&1 \
 // RUN: | FileCheck -check-prefixes=ASM,ASM_AMD %s
 // ASM_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH:sm_30]])
 // ASM_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
@@ -66,7 +80,7 @@
 // RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s 2>&1 \
 // RUN: | FileCheck -check-prefixes=BIN2,BIN2_NV %s
 // RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
-// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s 2>&1 \
+// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 -fgpu-rdc %s 2>&1 \
 // RUN: | FileCheck -check-prefixes=BIN2,BIN2_AMD %s
 // BIN2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]])
 // BIN2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]])
@@ -105,7 +119,7 @@
 // RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s -S 2>&1 \
 // RUN: | FileCheck -check-prefixes=ASM2,ASM2_NV %s
 // RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
-// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s -S 2>&1 \
+// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 -fgpu-rdc %s -S 2>&1 \
 // RUN: | FileCheck -check-prefixes=ASM2,ASM2_AMD %s
 // ASM2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH1:sm_30]])
 // ASM2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH1:gfx803]])
Index: test/Driver/hip-output-file-name.hip
===================================================================
--- test/Driver/hip-output-file-name.hip
+++ test/Driver/hip-output-file-name.hip
@@ -2,7 +2,7 @@
 // REQUIRES: x86-registered-target
 // REQUIRES: amdgpu-registered-target
 
-// RUN: %clang -### -c -target x86_64-linux-gnu \
+// RUN: %clang -### -c -target x86_64-linux-gnu -fgpu-rdc \
 // RUN:   -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
 // RUN: 2>&1 | FileCheck %s
 
Index: test/Driver/hip-toolchain-no-rdc.hip
===================================================================
--- /dev/null
+++ test/Driver/hip-toolchain-no-rdc.hip
@@ -0,0 +1,150 @@
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang -### -target x86_64-linux-gnu -fno-gpu-rdc \
+// RUN:   -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
+// RUN:   --hip-device-lib=lib1.bc --hip-device-lib=lib2.bc \
+// RUN:   --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \
+// RUN:   --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib2 \
+// RUN:   -fuse-ld=lld \
+// RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
+// RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
+// RUN: 2>&1 | FileCheck -check-prefixes=CHECK %s
+
+//
+// Compile device code in a.cu to code object for gfx803.
+//
+
+// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
+// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc"
+// CHECK-SAME: {{.*}} "-main-file-name" "a.cu" {{.*}} "-target-cpu" "gfx803"
+// CHECK-SAME: "-fcuda-is-device" "-fvisibility" "hidden"
+// CHECK-SAME: {{.*}} "-o" [[A_BC_803:".*bc"]] "-x" "hip"
+// CHECK-SAME: {{.*}} [[A_SRC:".*a.cu"]]
+
+// CHECK: [[LLVM_LINK:"*.llvm-link"]] [[A_BC_803]]
+// CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
+// CHECK-SAME: "-o" [[LINKED_BC_DEV_A_803:".*-gfx803-linked-.*bc"]]
+
+// CHECK: [[OPT:".*opt"]] [[LINKED_BC_DEV_A_803]] "-mtriple=amdgcn-amd-amdhsa"
+// CHECK-SAME: "-mcpu=gfx803"
+// CHECK-SAME: "-o" [[OPT_BC_DEV_A_803:".*-gfx803-optimized.*bc"]]
+
+// CHECK: [[LLC: ".*llc"]] [[OPT_BC_DEV_A_803]] "-mtriple=amdgcn-amd-amdhsa"
+// CHECK-SAME: "-filetype=obj" "-mcpu=gfx803" "-o" [[OBJ_DEV_A_803:".*-gfx803-.*o"]]
+
+// CHECK: [[LLD: ".*lld"]] "-flavor" "gnu" "--no-undefined" "-shared"
+// CHECK-SAME: "-o" "[[IMG_DEV_A_803:.*out]]" [[OBJ_DEV_A_803]]
+
+//
+// Compile device code in a.cu to code object for gfx900.
+//
+
+// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
+// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc"
+// CHECK-SAME: {{.*}} "-main-file-name" "a.cu" {{.*}} "-target-cpu" "gfx900"
+// CHECK-SAME: "-fcuda-is-device" "-fvisibility" "hidden"
+// CHECK-SAME: {{.*}} "-o" [[A_BC_900:".*bc"]] "-x" "hip"
+// CHECK-SAME: {{.*}} [[A_SRC]]
+
+// CHECK: [[LLVM_LINK:"*.llvm-link"]] [[A_BC_900]]
+// CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
+// CHECK-SAME: "-o" [[LINKED_BC_DEV_A_900:".*-gfx900-linked-.*bc"]]
+
+// CHECK: [[OPT:".*opt"]] [[LINKED_BC_DEV_A_900]] "-mtriple=amdgcn-amd-amdhsa"
+// CHECK-SAME: "-mcpu=gfx900"
+// CHECK-SAME: "-o" [[OPT_BC_DEV_A_900:".*-gfx900-optimized.*bc"]]
+
+// CHECK: [[LLC: ".*llc"]] [[OPT_BC_DEV_A_900]] "-mtriple=amdgcn-amd-amdhsa"
+// CHECK-SAME: "-filetype=obj" "-mcpu=gfx900" "-o" [[OBJ_DEV_A_900:".*-gfx900-.*o"]]
+
+// CHECK: [[LLD: ".*lld"]] "-flavor" "gnu" "--no-undefined" "-shared"
+// CHECK-SAME: "-o" "[[IMG_DEV_A_900:.*out]]" [[OBJ_DEV_A_900]]
+
+//
+// Bundle and embed device code in host object for a.cu.
+//
+
+// CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
+// CHECK-SAME: "-targets={{.*}},hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
+// CHECK-SAME: "-inputs={{.*}},[[IMG_DEV_A_803]],[[IMG_DEV_A_900]]" "-outputs=[[BUNDLE_A:.*hipfb]]"
+
+// CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
+// CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa" "-emit-obj"
+// CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
+// CHECK-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} [[A_SRC]]
+// CHECK-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]"
+
+//
+// Compile device code in b.hip to code object for gfx803.
+//
+
+// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
+// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc"
+// CHECK-SAME: {{.*}} "-main-file-name" "b.hip" {{.*}} "-target-cpu" "gfx803"
+// CHECK-SAME: "-fcuda-is-device" "-fvisibility" "hidden"
+// CHECK-SAME: {{.*}} "-o" [[B_BC_803:".*bc"]] "-x" "hip"
+// CHECK-SAME: {{.*}} [[B_SRC:".*b.hip"]]
+
+// CHECK: [[LLVM_LINK:"*.llvm-link"]] [[B_BC_803]]
+// CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
+// CHECK-SAME: "-o" [[LINKED_BC_DEV_B_803:".*-gfx803-linked-.*bc"]]
+
+// CHECK: [[OPT:".*opt"]] [[LINKED_BC_DEV_B_803]] "-mtriple=amdgcn-amd-amdhsa"
+// CHECK-SAME: "-mcpu=gfx803"
+// CHECK-SAME: "-o" [[OPT_BC_DEV_B_803:".*-gfx803-optimized.*bc"]]
+
+// CHECK: [[LLC: ".*llc"]] [[OPT_BC_DEV_B_803]] "-mtriple=amdgcn-amd-amdhsa"
+// CHECK-SAME: "-filetype=obj" "-mcpu=gfx803" "-o" [[OBJ_DEV_B_803:".*-gfx803-.*o"]]
+
+// CHECK: [[LLD: ".*lld"]] "-flavor" "gnu" "--no-undefined" "-shared"
+// CHECK-SAME: "-o" "[[IMG_DEV_B_803:.*out]]" [[OBJ_DEV_B_803]]
+
+//
+// Compile device code in b.hip to code object for gfx900.
+//
+
+// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
+// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc"
+// CHECK-SAME: {{.*}} "-main-file-name" "b.hip" {{.*}} "-target-cpu" "gfx900"
+// CHECK-SAME: "-fcuda-is-device" "-fvisibility" "hidden"
+// CHECK-SAME: {{.*}} "-o" [[B_BC_900:".*bc"]] "-x" "hip"
+// CHECK-SAME: {{.*}} [[B_SRC]]
+
+// CHECK: [[LLVM_LINK:"*.llvm-link"]] [[B_BC_900]]
+// CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
+// CHECK-SAME: "-o" [[LINKED_BC_DEV_B_900:".*-gfx900-linked-.*bc"]]
+
+// CHECK: [[OPT:".*opt"]] [[LINKED_BC_DEV_B_900]] "-mtriple=amdgcn-amd-amdhsa"
+// CHECK-SAME: "-mcpu=gfx900"
+// CHECK-SAME: "-o" [[OPT_BC_DEV_B_900:".*-gfx900-optimized.*bc"]]
+
+// CHECK: [[LLC: ".*llc"]] [[OPT_BC_DEV_B_900]] "-mtriple=amdgcn-amd-amdhsa"
+// CHECK-SAME: "-filetype=obj" "-mcpu=gfx900" "-o" [[OBJ_DEV_B_900:".*-gfx900-.*o"]]
+
+// CHECK: [[LLD: ".*lld"]] "-flavor" "gnu" "--no-undefined" "-shared"
+// CHECK-SAME: "-o" "[[IMG_DEV_B_900:.*out]]" [[OBJ_DEV_B_900]]
+
+//
+// Bundle and embed device code in host object for b.hip.
+//
+
+// CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
+// CHECK-SAME: "-targets={{.*}},hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
+// CHECK-SAME: "-inputs={{.*}},[[IMG_DEV_B_803]],[[IMG_DEV_B_900]]" "-outputs=[[BUNDLE_A:.*hipfb]]"
+
+// CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
+// CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa" "-emit-obj"
+// CHECK-SAME: {{.*}} "-main-file-name" "b.hip"
+// CHECK-SAME: {{.*}} "-o" [[B_OBJ_HOST:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} [[B_SRC]]
+// CHECK-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]"
+
+//
+// Link host objects.
+//
+
+// CHECK: [[LD:".*ld.*"]] {{.*}} [[A_OBJ_HOST]] [[B_OBJ_HOST]]
+// CHECK-NOT: "-T" "{{.*}}.lk"
Index: test/Driver/hip-toolchain-rdc.hip
===================================================================
--- /dev/null
+++ test/Driver/hip-toolchain-rdc.hip
@@ -0,0 +1,86 @@
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang -### -target x86_64-linux-gnu \
+// RUN:   -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
+// RUN:   --hip-device-lib=lib1.bc --hip-device-lib=lib2.bc \
+// RUN:   --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \
+// RUN:   --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib2 \
+// RUN:   -fuse-ld=lld -fgpu-rdc \
+// RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
+// RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
+// RUN: 2>&1 | FileCheck %s
+
+// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa" 
+// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc"
+// CHECK-SAME: {{.*}} "-main-file-name" "a.cu" {{.*}} "-target-cpu" "gfx803"
+// CHECK-SAME: "-fcuda-is-device" "-fgpu-rdc" "-fvisibility" "hidden"
+// CHECK-SAME: {{.*}} "-o" [[A_BC:".*bc"]] "-x" "hip"
+// CHECK-SAME: {{.*}} [[A_SRC:".*a.cu"]]
+
+// CHECK: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
+// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc"
+// CHECK-SAME: {{.*}} "-main-file-name" "b.hip" {{.*}} "-target-cpu" "gfx803"
+// CHECK-SAME: "-fcuda-is-device" "-fgpu-rdc" "-fvisibility" "hidden"
+// CHECK-SAME: {{.*}} "-o" [[B_BC:".*bc"]] "-x" "hip"
+// CHECK-SAME: {{.*}} [[B_SRC:".*b.hip"]]
+
+// CHECK: [[LLVM_LINK:"*.llvm-link"]] [[A_BC]] [[B_BC]]
+// CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
+// CHECK-SAME: "-o" [[LINKED_BC_DEV1:".*-gfx803-linked-.*bc"]]
+
+// CHECK: [[OPT:".*opt"]] [[LINKED_BC_DEV1]] "-mtriple=amdgcn-amd-amdhsa"
+// CHECK-SAME: "-mcpu=gfx803"
+// CHECK-SAME: "-o" [[OPT_BC_DEV1:".*-gfx803-optimized.*bc"]]
+
+// CHECK: [[LLC: ".*llc"]] [[OPT_BC_DEV1]] "-mtriple=amdgcn-amd-amdhsa"
+// CHECK-SAME: "-filetype=obj" "-mcpu=gfx803" "-o" [[OBJ_DEV1:".*-gfx803-.*o"]]
+
+// CHECK: [[LLD: ".*lld"]] "-flavor" "gnu" "--no-undefined" "-shared"
+// CHECK-SAME: "-o" "[[IMG_DEV1:.*out]]" [[OBJ_DEV1]]
+
+// CHECK: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa" 
+// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc"
+// CHECK-SAME: {{.*}} "-main-file-name" "a.cu" {{.*}} "-target-cpu" "gfx900"
+// CHECK-SAME: "-fcuda-is-device" {{.*}} "-o" [[A_BC:".*bc"]] "-x" "hip"
+// CHECK-SAME: {{.*}} [[A_SRC]]
+
+// CHECK: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
+// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc"
+// CHECK-SAME: {{.*}} "-main-file-name" "b.hip" {{.*}} "-target-cpu" "gfx900"
+// CHECK-SAME: "-fcuda-is-device" {{.*}} "-o" [[B_BC:".*bc"]] "-x" "hip"
+// CHECK-SAME: {{.*}} [[B_SRC]]
+
+// CHECK: [[LLVM_LINK]] [[A_BC]] [[B_BC]]
+// CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
+// CHECK-SAME: "-o" [[LINKED_BC_DEV2:".*-gfx900-linked-.*bc"]]
+
+// CHECK: [[OPT]] [[LINKED_BC_DEV2]] "-mtriple=amdgcn-amd-amdhsa"
+// CHECK-SAME: "-mcpu=gfx900"
+// CHECK-SAME: "-o" [[OPT_BC_DEV2:".*-gfx900-optimized.*bc"]]
+
+// CHECK: [[LLC]] [[OPT_BC_DEV2]] "-mtriple=amdgcn-amd-amdhsa"
+// CHECK-SAME: "-filetype=obj" "-mcpu=gfx900" "-o" [[OBJ_DEV2:".*-gfx900-.*o"]]
+
+// CHECK: [[LLD]] "-flavor" "gnu" "--no-undefined" "-shared"
+// CHECK-SAME: "-o" "[[IMG_DEV2:.*out]]" [[OBJ_DEV2]]
+
+// CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
+// CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa" "-emit-obj"
+// CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
+// CHECK-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} [[A_SRC]]
+
+// CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
+// CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa" "-emit-obj"
+// CHECK-SAME: {{.*}} "-main-file-name" "b.hip"
+// CHECK-SAME: {{.*}} "-o" [[B_OBJ_HOST:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} [[B_SRC]]
+
+// CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
+// CHECK-SAME: "-targets={{.*}},hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
+// CHECK-SAME: "-inputs={{.*}},[[IMG_DEV1]],[[IMG_DEV2]]" "-outputs=[[BUNDLE:.*hipfb]]"
+
+// CHECK: [[LD:".*ld.*"]] {{.*}} [[A_OBJ_HOST]] [[B_OBJ_HOST]]
+// CHECK-SAME: {{.*}} "-T" "{{.*}}.lk"
Index: test/Driver/hip-toolchain.hip
===================================================================
--- test/Driver/hip-toolchain.hip
+++ /dev/null
@@ -1,86 +0,0 @@
-// REQUIRES: clang-driver
-// REQUIRES: x86-registered-target
-// REQUIRES: amdgpu-registered-target
-
-// RUN: %clang -### -target x86_64-linux-gnu \
-// RUN:   -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
-// RUN:   --hip-device-lib=lib1.bc --hip-device-lib=lib2.bc \
-// RUN:   --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \
-// RUN:   --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib2 \
-// RUN:   -fuse-ld=lld \
-// RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
-// RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck %s
-
-// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa" 
-// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc"
-// CHECK-SAME: {{.*}} "-main-file-name" "a.cu" {{.*}} "-target-cpu" "gfx803"
-// CHECK-SAME: "-fcuda-is-device" "-fvisibility" "hidden"
-// CHECK-SAME: {{.*}} "-o" [[A_BC:".*bc"]] "-x" "hip"
-// CHECK-SAME: {{.*}} [[A_SRC:".*a.cu"]]
-
-// CHECK: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
-// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc"
-// CHECK-SAME: {{.*}} "-main-file-name" "b.hip" {{.*}} "-target-cpu" "gfx803"
-// CHECK-SAME: "-fcuda-is-device" "-fvisibility" "hidden"
-// CHECK-SAME: {{.*}} "-o" [[B_BC:".*bc"]] "-x" "hip"
-// CHECK-SAME: {{.*}} [[B_SRC:".*b.hip"]]
-
-// CHECK: [[LLVM_LINK:"*.llvm-link"]] [[A_BC]] [[B_BC]]
-// CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
-// CHECK-SAME: "-o" [[LINKED_BC_DEV1:".*-gfx803-linked-.*bc"]]
-
-// CHECK: [[OPT:".*opt"]] [[LINKED_BC_DEV1]] "-mtriple=amdgcn-amd-amdhsa"
-// CHECK-SAME: "-mcpu=gfx803"
-// CHECK-SAME: "-o" [[OPT_BC_DEV1:".*-gfx803-optimized.*bc"]]
-
-// CHECK: [[LLC: ".*llc"]] [[OPT_BC_DEV1]] "-mtriple=amdgcn-amd-amdhsa"
-// CHECK-SAME: "-filetype=obj" "-mcpu=gfx803" "-o" [[OBJ_DEV1:".*-gfx803-.*o"]]
-
-// CHECK: [[LLD: ".*lld"]] "-flavor" "gnu" "--no-undefined" "-shared"
-// CHECK-SAME: "-o" "[[IMG_DEV1:.*out]]" [[OBJ_DEV1]]
-
-// CHECK: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa" 
-// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc"
-// CHECK-SAME: {{.*}} "-main-file-name" "a.cu" {{.*}} "-target-cpu" "gfx900"
-// CHECK-SAME: "-fcuda-is-device" {{.*}} "-o" [[A_BC:".*bc"]] "-x" "hip"
-// CHECK-SAME: {{.*}} [[A_SRC]]
-
-// CHECK: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
-// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc"
-// CHECK-SAME: {{.*}} "-main-file-name" "b.hip" {{.*}} "-target-cpu" "gfx900"
-// CHECK-SAME: "-fcuda-is-device" {{.*}} "-o" [[B_BC:".*bc"]] "-x" "hip"
-// CHECK-SAME: {{.*}} [[B_SRC]]
-
-// CHECK: [[LLVM_LINK]] [[A_BC]] [[B_BC]]
-// CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
-// CHECK-SAME: "-o" [[LINKED_BC_DEV2:".*-gfx900-linked-.*bc"]]
-
-// CHECK: [[OPT]] [[LINKED_BC_DEV2]] "-mtriple=amdgcn-amd-amdhsa"
-// CHECK-SAME: "-mcpu=gfx900"
-// CHECK-SAME: "-o" [[OPT_BC_DEV2:".*-gfx900-optimized.*bc"]]
-
-// CHECK: [[LLC]] [[OPT_BC_DEV2]] "-mtriple=amdgcn-amd-amdhsa"
-// CHECK-SAME: "-filetype=obj" "-mcpu=gfx900" "-o" [[OBJ_DEV2:".*-gfx900-.*o"]]
-
-// CHECK: [[LLD]] "-flavor" "gnu" "--no-undefined" "-shared"
-// CHECK-SAME: "-o" "[[IMG_DEV2:.*out]]" [[OBJ_DEV2]]
-
-// CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
-// CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa" "-emit-obj"
-// CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
-// CHECK-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip"
-// CHECK-SAME: {{.*}} [[A_SRC]]
-
-// CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
-// CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa" "-emit-obj"
-// CHECK-SAME: {{.*}} "-main-file-name" "b.hip"
-// CHECK-SAME: {{.*}} "-o" [[B_OBJ_HOST:".*o"]] "-x" "hip"
-// CHECK-SAME: {{.*}} [[B_SRC]]
-
-// CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
-// CHECK-SAME: "-targets={{.*}},hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
-// CHECK-SAME: "-inputs={{.*}},[[IMG_DEV1]],[[IMG_DEV2]]" "-outputs=[[BUNDLE:.*o]]"
-
-// CHECK: [[LD:".*ld.*"]] {{.*}} [[A_OBJ_HOST]] [[B_OBJ_HOST]]
-// CHECK-SAME: {{.*}} "-T" "{{.*}}.lk"
Index: test/SemaCUDA/extern-shared.cu
===================================================================
--- test/SemaCUDA/extern-shared.cu
+++ test/SemaCUDA/extern-shared.cu
@@ -1,8 +1,8 @@
 // RUN: %clang_cc1 -fsyntax-only -Wundefined-internal -verify %s
 // RUN: %clang_cc1 -fsyntax-only -Wundefined-internal -fcuda-is-device -verify %s
 
-// RUN: %clang_cc1 -fsyntax-only -Wundefined-internal -fcuda-rdc -verify=rdc %s
-// RUN: %clang_cc1 -fsyntax-only -Wundefined-internal -fcuda-is-device -fcuda-rdc -verify=rdc %s
+// RUN: %clang_cc1 -fsyntax-only -Wundefined-internal -fgpu-rdc -verify=rdc %s
+// RUN: %clang_cc1 -fsyntax-only -Wundefined-internal -fcuda-is-device -fgpu-rdc -verify=rdc %s
 
 // Most of these declarations are fine in separate compilation mode.