Index: cfe/trunk/include/clang/Driver/Options.td
===================================================================
--- cfe/trunk/include/clang/Driver/Options.td
+++ cfe/trunk/include/clang/Driver/Options.td
@@ -586,6 +586,8 @@
 def fcuda_short_ptr : Flag<["-"], "fcuda-short-ptr">, Flags<[CC1Option]>,
   HelpText<"Use 32-bit pointers for accessing const/local/shared address spaces.">;
 def fno_cuda_short_ptr : Flag<["-"], "fno-cuda-short-ptr">;
+def fhip_dump_offload_linker_script : Flag<["-"], "fhip-dump-offload-linker-script">,
+  Group<f_Group>, Flags<[NoArgumentUnused, HelpHidden]>;
 def dA : Flag<["-"], "dA">, Group<d_Group>;
 def dD : Flag<["-"], "dD">, Group<d_Group>, Flags<[CC1Option]>,
   HelpText<"Print macro definitions in -E mode in addition to normal output">;
Index: cfe/trunk/lib/CodeGen/CGCUDANV.cpp
===================================================================
--- cfe/trunk/lib/CodeGen/CGCUDANV.cpp
+++ cfe/trunk/lib/CodeGen/CGCUDANV.cpp
@@ -27,6 +27,8 @@
 using namespace CodeGen;
 
 namespace {
+constexpr unsigned CudaFatMagic = 0x466243b1;
+constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"
 
 class CGNVCUDARuntime : public CGCUDARuntime {
 
@@ -310,19 +312,20 @@
 /// }
 /// \endcode
 llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
+  bool IsHIP = CGM.getLangOpts().HIP;
   // No need to generate ctors/dtors if there is no GPU binary.
-  std::string GpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
-  if (GpuBinaryFileName.empty())
+  StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
+  if (CudaGpuBinaryFileName.empty() && !IsHIP)
     return nullptr;
 
-  // void __cuda_register_globals(void* handle);
+  // void __{cuda|hip}_register_globals(void* handle);
   llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
   // We always need a function to pass in as callback. Create a dummy
   // implementation if we don't need to register anything.
   if (RelocatableDeviceCode && !RegisterGlobalsFunc)
     RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());
 
-  // void ** __cudaRegisterFatBinary(void *);
+  // void ** __{cuda|hip}RegisterFatBinary(void *);
   llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction(
       llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
       addUnderscoredPrefixToName("RegisterFatBinary"));
@@ -334,12 +337,16 @@
   // global variable and save a reference in GpuBinaryHandle to be cleaned up
   // in destructor on exit. Then associate all known kernels with the GPU binary
   // handle so CUDA runtime can figure out what to call on the GPU side.
-  llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GpuBinaryOrErr =
-      llvm::MemoryBuffer::getFileOrSTDIN(GpuBinaryFileName);
-  if (std::error_code EC = GpuBinaryOrErr.getError()) {
-    CGM.getDiags().Report(diag::err_cannot_open_file)
-        << GpuBinaryFileName << EC.message();
-    return nullptr;
+  std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary;
+  if (!IsHIP) {
+    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr =
+        llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName);
+    if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
+      CGM.getDiags().Report(diag::err_cannot_open_file)
+          << CudaGpuBinaryFileName << EC.message();
+      return nullptr;
+    }
+    CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
   }
 
   llvm::Function *ModuleCtorFunc = llvm::Function::Create(
@@ -353,28 +360,60 @@
   CtorBuilder.SetInsertPoint(CtorEntryBB);
 
   const char *FatbinConstantName;
-  if (RelocatableDeviceCode)
+  const char *FatbinSectionName;
+  const char *ModuleIDSectionName;
+  StringRef ModuleIDPrefix;
+  llvm::Constant *FatBinStr;
+  unsigned FatMagic;
+  if (IsHIP) {
+    FatbinConstantName = ".hip_fatbin";
+    FatbinSectionName = ".hipFatBinSegment";
+
+    ModuleIDSectionName = "__hip_module_id";
+    ModuleIDPrefix = "__hip_";
+
+    // For HIP, create an external symbol __hip_fatbin in section .hip_fatbin.
+    // The external symbol is supposed to contain the fat binary but will be
+    // populated somewhere else, e.g. by lld through link script.
+    FatBinStr = new llvm::GlobalVariable(
+        CGM.getModule(), CGM.Int8Ty,
+        /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
+        "__hip_fatbin", nullptr,
+        llvm::GlobalVariable::NotThreadLocal);
+    cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
+
+    FatMagic = HIPFatMagic;
+  } else {
+    if (RelocatableDeviceCode)
+      // TODO: Figure out how this is called on mac OS!
+      FatbinConstantName = "__nv_relfatbin";
+    else
+      FatbinConstantName =
+          CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
+    // NVIDIA's cuobjdump looks for fatbins in this section.
+    FatbinSectionName =
+        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
+
     // TODO: Figure out how this is called on mac OS!
-    FatbinConstantName = "__nv_relfatbin";
-  else
-    FatbinConstantName =
-        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
-  // NVIDIA's cuobjdump looks for fatbins in this section.
-  const char *FatbinSectionName =
-      CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
-  // TODO: Figure out how this is called on mac OS!
-  const char *NVModuleIDSectionName = "__nv_module_id";
+    ModuleIDSectionName = "__nv_module_id";
+    ModuleIDPrefix = "__nv_";
+
+    // For CUDA, create a string literal containing the fat binary loaded from
+    // the given file.
+    FatBinStr = makeConstantString(CudaGpuBinary->getBuffer(), "",
+                                   FatbinConstantName, 8);
+    FatMagic = CudaFatMagic;
+  }
 
   // Create initialized wrapper structure that points to the loaded GPU binary
   ConstantInitBuilder Builder(CGM);
   auto Values = Builder.beginStruct(FatbinWrapperTy);
   // Fatbin wrapper magic.
-  Values.addInt(IntTy, 0x466243b1);
+  Values.addInt(IntTy, FatMagic);
   // Fatbin version.
   Values.addInt(IntTy, 1);
   // Data.
-  Values.add(makeConstantString(GpuBinaryOrErr.get()->getBuffer(), "",
-                                FatbinConstantName, 8));
+  Values.add(FatBinStr);
   // Unused in fatbin v1.
   Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
   llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
@@ -382,10 +421,10 @@
       /*constant*/ true);
   FatbinWrapper->setSection(FatbinSectionName);
 
-  // Register binary with CUDA runtime. This is substantially different in
+  // Register binary with CUDA/HIP runtime. This is substantially different in
   // default mode vs. separate compilation!
   if (!RelocatableDeviceCode) {
-    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
+    // GpuBinaryHandle = __{cuda|hip}RegisterFatBinary(&FatbinWrapper);
     llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
         RegisterFatbinFunc,
         CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
@@ -397,34 +436,34 @@
     CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
                                    CGM.getPointerAlign());
 
-    // Call __cuda_register_globals(GpuBinaryHandle);
+    // Call __{cuda|hip}_register_globals(GpuBinaryHandle);
     if (RegisterGlobalsFunc)
       CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
   } else {
     // Generate a unique module ID.
-    SmallString<64> NVModuleID;
-    llvm::raw_svector_ostream OS(NVModuleID);
-    OS << "__nv_" << llvm::format("%x", FatbinWrapper->getGUID());
-    llvm::Constant *NVModuleIDConstant =
-        makeConstantString(NVModuleID.str(), "", NVModuleIDSectionName, 32);
+    SmallString<64> ModuleID;
+    llvm::raw_svector_ostream OS(ModuleID);
+    OS << ModuleIDPrefix << llvm::format("%x", FatbinWrapper->getGUID());
+    llvm::Constant *ModuleIDConstant =
+        makeConstantString(ModuleID.str(), "", ModuleIDSectionName, 32);
 
-    // Create an alias for the FatbinWrapper that nvcc will look for.
+    // Create an alias for the FatbinWrapper that nvcc or hip backend will
+    // look for.
     llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
-                              Twine("__fatbinwrap") + NVModuleID,
-                              FatbinWrapper);
+                              Twine("__fatbinwrap") + ModuleID, FatbinWrapper);
 
-    // void __cudaRegisterLinkedBinary%NVModuleID%(void (*)(void *), void *,
+    // void __{cuda|hip}RegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
     // void *, void (*)(void **))
     SmallString<128> RegisterLinkedBinaryName(
         addUnderscoredPrefixToName("RegisterLinkedBinary"));
-    RegisterLinkedBinaryName += NVModuleID;
+    RegisterLinkedBinaryName += ModuleID;
     llvm::Constant *RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
         getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);
 
     assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
     llvm::Value *Args[] = {RegisterGlobalsFunc,
                            CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy),
-                           NVModuleIDConstant,
+                           ModuleIDConstant,
                            makeDummyFunction(getCallbackFnTy())};
     CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
   }
Index: cfe/trunk/lib/Driver/ToolChains/CommonArgs.h
===================================================================
--- cfe/trunk/lib/Driver/ToolChains/CommonArgs.h
+++ cfe/trunk/lib/Driver/ToolChains/CommonArgs.h
@@ -52,6 +52,12 @@
                            llvm::opt::ArgStringList &CmdArgs,
                            const JobAction &JA);
 
+void AddHIPLinkerScript(const ToolChain &TC, Compilation &C,
+                        const InputInfo &Output, const InputInfoList &Inputs,
+                        const llvm::opt::ArgList &Args,
+                        llvm::opt::ArgStringList &CmdArgs, const JobAction &JA,
+                        const Tool &T);
+
 const char *SplitDebugName(const llvm::opt::ArgList &Args,
                            const InputInfo &Input);
 
Index: cfe/trunk/lib/Driver/ToolChains/CommonArgs.cpp
===================================================================
--- cfe/trunk/lib/Driver/ToolChains/CommonArgs.cpp
+++ cfe/trunk/lib/Driver/ToolChains/CommonArgs.cpp
@@ -146,12 +146,14 @@
   Args.AddAllArgValues(CmdArgs, options::OPT_Zlinker_input);
 
   for (const auto &II : Inputs) {
-    // If the current tool chain refers to an OpenMP offloading host, we should
-    // ignore inputs that refer to OpenMP offloading devices - they will be
-    // embedded according to a proper linker script.
+    // If the current tool chain refers to an OpenMP or HIP offloading host, we
+    // should ignore inputs that refer to OpenMP or HIP offloading devices -
+    // they will be embedded according to a proper linker script.
     if (auto *IA = II.getAction())
-      if (JA.isHostOffloading(Action::OFK_OpenMP) &&
-          IA->isDeviceOffloading(Action::OFK_OpenMP))
+      if ((JA.isHostOffloading(Action::OFK_OpenMP) &&
+           IA->isDeviceOffloading(Action::OFK_OpenMP)) ||
+          (JA.isHostOffloading(Action::OFK_HIP) &&
+           IA->isDeviceOffloading(Action::OFK_HIP)))
         continue;
 
     if (!TC.HasNativeLLVMSupport() && types::isLLVMIR(II.getType()))
@@ -1288,6 +1290,124 @@
   Lksf << LksBuffer;
 }
 
+/// Add HIP linker script arguments at the end of the argument list so that
+/// the fat binary is built by embedding the device images into the host. The
+/// linker script also defines a symbol required by the code generation so that
+/// the image can be retrieved at runtime. This should be used only in tool
+/// chains that support linker scripts.
+void tools::AddHIPLinkerScript(const ToolChain &TC, Compilation &C,
+                               const InputInfo &Output,
+                               const InputInfoList &Inputs, const ArgList &Args,
+                               ArgStringList &CmdArgs, const JobAction &JA,
+                               const Tool &T) {
+
+  // If this is not a HIP host toolchain, we don't need to do anything.
+  if (!JA.isHostOffloading(Action::OFK_HIP))
+    return;
+
+  // Create temporary linker script. Keep it if save-temps is enabled.
+  const char *LKS;
+  SmallString<256> Name = llvm::sys::path::filename(Output.getFilename());
+  if (C.getDriver().isSaveTempsEnabled()) {
+    llvm::sys::path::replace_extension(Name, "lk");
+    LKS = C.getArgs().MakeArgString(Name.c_str());
+  } else {
+    llvm::sys::path::replace_extension(Name, "");
+    Name = C.getDriver().GetTemporaryPath(Name, "lk");
+    LKS = C.addTempFile(C.getArgs().MakeArgString(Name.c_str()));
+  }
+
+  // Add linker script option to the command.
+  CmdArgs.push_back("-T");
+  CmdArgs.push_back(LKS);
+
+  // Create a buffer to write the contents of the linker script.
+  std::string LksBuffer;
+  llvm::raw_string_ostream LksStream(LksBuffer);
+
+  // Get the HIP offload tool chain.
+  auto *HIPTC = static_cast<const toolchains::CudaToolChain *>(
+      C.getSingleOffloadToolChain<Action::OFK_HIP>());
+  assert(HIPTC->getTriple().getArch() == llvm::Triple::amdgcn &&
+         "Wrong platform");
+
+  // Construct clang-offload-bundler command to bundle object files for
+  // for different GPU archs.
+  ArgStringList BundlerArgs;
+  BundlerArgs.push_back(Args.MakeArgString("-type=o"));
+
+  // ToDo: Remove the dummy host binary entry which is required by
+  // clang-offload-bundler.
+  std::string BundlerTargetArg = "-targets=host-x86_64-unknown-linux";
+  std::string BundlerInputArg = "-inputs=/dev/null";
+
+  for (const auto &II : Inputs) {
+    const Action *A = II.getAction();
+    // Is this a device linking action?
+    if (A && isa<LinkJobAction>(A) && A->isDeviceOffloading(Action::OFK_HIP)) {
+      BundlerTargetArg = BundlerTargetArg + ",hip-amdgcn-amd-amdhsa-" +
+                         StringRef(A->getOffloadingArch()).str();
+      BundlerInputArg = BundlerInputArg + "," + II.getFilename();
+    }
+  }
+  BundlerArgs.push_back(Args.MakeArgString(BundlerTargetArg));
+  BundlerArgs.push_back(Args.MakeArgString(BundlerInputArg));
+
+  std::string BundleFileName = C.getDriver().GetTemporaryPath("BUNDLE", "o");
+  const char *BundleFile =
+      C.addTempFile(C.getArgs().MakeArgString(BundleFileName.c_str()));
+  auto BundlerOutputArg =
+      Args.MakeArgString(std::string("-outputs=").append(BundleFile));
+  BundlerArgs.push_back(BundlerOutputArg);
+
+  SmallString<128> BundlerPath(C.getDriver().Dir);
+  llvm::sys::path::append(BundlerPath, "clang-offload-bundler");
+  const char *Bundler = Args.MakeArgString(BundlerPath);
+  C.addCommand(llvm::make_unique<Command>(JA, T, Bundler, BundlerArgs, Inputs));
+
+  // Add commands to embed target binaries. We ensure that each section and
+  // image is 16-byte aligned. This is not mandatory, but increases the
+  // likelihood of data to be aligned with a cache block in several main host
+  // machines.
+  LksStream << "/*\n";
+  LksStream << "       HIP Offload Linker Script\n";
+  LksStream << " *** Automatically generated by Clang ***\n";
+  LksStream << "*/\n";
+  LksStream << "TARGET(binary)\n";
+  LksStream << "INPUT(" << BundleFileName << ")\n";
+  LksStream << "SECTIONS\n";
+  LksStream << "{\n";
+  LksStream << "  .hip_fatbin :\n";
+  LksStream << "  ALIGN(0x10)\n";
+  LksStream << "  {\n";
+  LksStream << "    PROVIDE_HIDDEN(__hip_fatbin = .);\n";
+  LksStream << "    " << BundleFileName << "\n";
+  LksStream << "  }\n";
+  LksStream << "}\n";
+  LksStream << "INSERT BEFORE .data\n";
+  LksStream.flush();
+
+  // Dump the contents of the linker script if the user requested that. We
+  // support this option to enable testing of behavior with -###.
+  if (C.getArgs().hasArg(options::OPT_fhip_dump_offload_linker_script))
+    llvm::errs() << LksBuffer;
+
+  // If this is a dry run, do not create the linker script file.
+  if (C.getArgs().hasArg(options::OPT__HASH_HASH_HASH))
+    return;
+
+  // Open script file and write the contents.
+  std::error_code EC;
+  llvm::raw_fd_ostream Lksf(LKS, EC, llvm::sys::fs::F_None);
+
+  if (EC) {
+    C.getDriver().Diag(clang::diag::err_unable_to_make_temp) << EC.message();
+    return;
+  }
+
+  Lksf << LksBuffer;
+}
+
 SmallString<128> tools::getStatsFileName(const llvm::opt::ArgList &Args,
                                          const InputInfo &Output,
                                          const InputInfo &Input,
Index: cfe/trunk/lib/Driver/ToolChains/Gnu.cpp
===================================================================
--- cfe/trunk/lib/Driver/ToolChains/Gnu.cpp
+++ cfe/trunk/lib/Driver/ToolChains/Gnu.cpp
@@ -535,6 +535,10 @@
   // Add OpenMP offloading linker script args if required.
   AddOpenMPLinkerScript(getToolChain(), C, Output, Inputs, Args, CmdArgs, JA);
 
+  // Add HIP offloading linker script args if required.
+  AddHIPLinkerScript(getToolChain(), C, Output, Inputs, Args, CmdArgs, JA,
+                     *this);
+
   C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
 }
 
Index: cfe/trunk/test/CodeGenCUDA/device-stub.cu
===================================================================
--- cfe/trunk/test/CodeGenCUDA/device-stub.cu
+++ cfe/trunk/test/CodeGenCUDA/device-stub.cu
@@ -1,13 +1,13 @@
 // RUN: echo "GPU binary would be here" > %t
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
 // RUN:     -fcuda-include-gpubinary %t -o - \
-// RUN:   | FileCheck %s --check-prefixes=ALL,NORDC,CUDA
+// RUN:   | FileCheck %s --check-prefixes=ALL,NORDC,CUDA,CUDANORDC
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
 // RUN:     -fcuda-include-gpubinary %t -o -  -DNOGLOBALS \
-// RUN:   | FileCheck %s -check-prefix=NOGLOBALS
+// RUN:   | FileCheck %s -check-prefixes=NOGLOBALS,CUDANOGLOBALS
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
 // RUN:     -fcuda-rdc -fcuda-include-gpubinary %t -o - \
-// RUN:   | FileCheck %s --check-prefixes=ALL,RDC,CUDA
+// RUN:   | FileCheck %s --check-prefixes=ALL,RDC,CUDA,CUDARDC
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - \
 // RUN:   | FileCheck %s -check-prefix=NOGPUBIN
 
@@ -16,10 +16,10 @@
 // RUN:   | FileCheck %s --check-prefixes=ALL,NORDC,HIP
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
 // RUN:     -fcuda-include-gpubinary %t -o -  -DNOGLOBALS -x hip \
-// RUN:   | FileCheck %s -check-prefix=NOGLOBALS
+// RUN:   | FileCheck %s -check-prefixes=NOGLOBALS,HIPNOGLOBALS
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
 // RUN:     -fcuda-rdc -fcuda-include-gpubinary %t -o - -x hip \
-// RUN:   | FileCheck %s --check-prefixes=ALL,RDC,HIP
+// RUN:   | FileCheck %s --check-prefixes=ALL,RDC,HIP,HIPRDC
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - -x hip\
 // RUN:   | FileCheck %s -check-prefix=NOGPUBIN
 
@@ -64,21 +64,26 @@
 // * constant unnamed string with the kernel name
 // ALL: private unnamed_addr constant{{.*}}kernelfunc{{.*}}\00"
 // * constant unnamed string with GPU binary
-// ALL: private unnamed_addr constant{{.*GPU binary would be here.*}}\00"
-// NORDC-SAME: section ".nv_fatbin", align 8
-// RDC-SAME: section "__nv_relfatbin", align 8
+// HIP: @[[FATBIN:__hip_fatbin]] = external constant i8, section ".hip_fatbin"
+// CUDA: @[[FATBIN:.*]] = private unnamed_addr constant{{.*GPU binary would be here.*}}\00",
+// CUDANORDC-SAME: section ".nv_fatbin", align 8
+// CUDARDC-SAME: section "__nv_relfatbin", align 8
 // * constant struct that wraps GPU binary
-// CUDA: @__[[PREFIX:cuda]]_fatbin_wrapper = internal constant
-// CUDA-SAME: { i32, i32, i8*, i8* }
-// HIP: @__[[PREFIX:hip]]_fatbin_wrapper = internal constant
-// HIP-SAME:  { i32, i32, i8*, i8* }
-// ALL-SAME: { i32 1180844977, i32 1, {{.*}}, i8* null }
-// ALL-SAME: section ".nvFatBinSegment"
+// ALL: @__[[PREFIX:cuda|hip]]_fatbin_wrapper = internal constant
+// ALL-SAME: { i32, i32, i8*, i8* }
+// CUDA-SAME: { i32 1180844977, i32 1,
+// HIP-SAME: { i32 1212764230, i32 1,
+// CUDA-SAME: i8* getelementptr inbounds ({{.*}}@[[FATBIN]], i64 0, i64 0),
+// HIP-SAME:  i8* @[[FATBIN]],
+// ALL-SAME: i8* null }
+// CUDA-SAME: section ".nvFatBinSegment"
+// HIP-SAME: section ".hipFatBinSegment"
 // * variable to save GPU binary handle after initialization
 // NORDC: @__[[PREFIX]]_gpubin_handle = internal global i8** null
 // * constant unnamed string with NVModuleID
 // RDC: [[MODULE_ID_GLOBAL:@.*]] = private unnamed_addr constant
-// RDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32
+// CUDARDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32
+// HIPRDC-SAME: c"[[MODULE_ID:.+]]\00", section "__hip_module_id", align 32
 // * Make sure our constructor was added to global ctor list.
 // ALL: @llvm.global_ctors = appending global {{.*}}@__[[PREFIX]]_module_ctor
 // * In separate mode we also register a destructor.
@@ -136,9 +141,10 @@
 // There should be no __[[PREFIX]]_register_globals if we have no
 // device-side globals, but we still need to register GPU binary.
 // Skip GPU binary string first.
-// NOGLOBALS: @0 = private unnamed_addr constant{{.*}}
+// CUDANOGLOBALS: @{{.*}} = private unnamed_addr constant{{.*}}
+// HIPNOGLOBALS: @{{.*}} = external constant{{.*}}
 // NOGLOBALS-NOT: define internal void @__{{.*}}_register_globals
-// NOGLOBALS: define internal void @__[[PREFIX:.*]]_module_ctor
+// NOGLOBALS: define internal void @__[[PREFIX:cuda|hip]]_module_ctor
 // NOGLOBALS: call{{.*}}[[PREFIX]]RegisterFatBinary{{.*}}__[[PREFIX]]_fatbin_wrapper
 // NOGLOBALS-NOT: call void @__[[PREFIX]]_register_globals
 // NOGLOBALS: define internal void @__[[PREFIX]]_module_dtor