Index: include/clang/Basic/DiagnosticDriverKinds.td
===================================================================
--- include/clang/Basic/DiagnosticDriverKinds.td
+++ include/clang/Basic/DiagnosticDriverKinds.td
@@ -31,6 +31,8 @@
 def err_drv_no_cuda_libdevice : Error<
   "cannot find libdevice for %0. Provide path to different CUDA installation "
   "via --cuda-path, or pass -nocudalib to build without linking with libdevice.">;
+def err_drv_no_hip_libdevice : Error<
+  "cannot find libdevice for %0. Please install device library.">;
 def err_drv_cuda_version_unsupported : Error<
   "GPU arch %0 is supported by CUDA versions between %1 and %2 (inclusive), "
   "but installation at %3 is %4.  Use --cuda-path to specify a different CUDA "
Index: include/clang/Driver/Options.td
===================================================================
--- include/clang/Driver/Options.td
+++ include/clang/Driver/Options.td
@@ -547,7 +547,10 @@
   HelpText<"Compile CUDA code for both host and device (default).  Has no "
            "effect on non-CUDA compilations.">;
 def cuda_gpu_arch_EQ : Joined<["--"], "cuda-gpu-arch=">, Flags<[DriverOption]>,
-  HelpText<"CUDA GPU architecture (e.g. sm_35).  May be specified more than once.">;
+  HelpText<"CUDA/HIP GPU architecture (e.g. sm_35).  May be specified more than once.">;
+def : Joined<["--"], "offload-arch=">, Alias<cuda_gpu_arch_EQ>;
+def offload_archs : Joined<["--"], "offload-archs=">, Flags<[DriverOption]>,
+  HelpText<"List of offload architectures for CUDA/HIP/OpenMP (e.g. sm_35,gfx803).">;
 def no_cuda_gpu_arch_EQ : Joined<["--"], "no-cuda-gpu-arch=">, Flags<[DriverOption]>,
   HelpText<"Remove GPU architecture (e.g. sm_35) from the list of GPUs to compile for. "
            "'all' resets the list to its default value.">;
Index: include/clang/Driver/ToolChain.h
===================================================================
--- include/clang/Driver/ToolChain.h
+++ include/clang/Driver/ToolChain.h
@@ -121,11 +121,13 @@
   path_list ProgramPaths;
 
   mutable std::unique_ptr<Tool> Clang;
+  mutable std::unique_ptr<Tool> DeviceLibraryLink;
   mutable std::unique_ptr<Tool> Assemble;
   mutable std::unique_ptr<Tool> Link;
   mutable std::unique_ptr<Tool> OffloadBundler;
 
   Tool *getClang() const;
+  Tool *getDeviceLibraryLink() const;
   Tool *getAssemble() const;
   Tool *getLink() const;
   Tool *getClangAs() const;
@@ -151,6 +153,7 @@
   void setTripleEnvironment(llvm::Triple::EnvironmentType Env);
 
   virtual Tool *buildAssembler() const;
+  virtual Tool *buildDeviceLibraryLinker() const;
   virtual Tool *buildLinker() const;
   virtual Tool *getTool(Action::ActionClass AC) const;
 
Index: include/clang/Driver/Types.def
===================================================================
--- include/clang/Driver/Types.def
+++ include/clang/Driver/Types.def
@@ -46,6 +46,9 @@
 TYPE("cuda-cpp-output",          PP_CUDA,      INVALID,         "cui",   "u")
 TYPE("cuda",                     CUDA,         PP_CUDA,         "cu",    "u")
 TYPE("cuda",                     CUDA_DEVICE,  PP_CUDA,         "cu",    "")
+TYPE("hip-cpp-output",           PP_HIP,       INVALID,         "cui",   "u")
+TYPE("hip",                      HIP,          PP_HIP,          "cu",    "u")
+TYPE("hip",                      HIP_DEVICE,   PP_HIP,          "cu",    "")
 TYPE("objective-c-cpp-output",   PP_ObjC,      INVALID,         "mi",    "u")
 TYPE("objc-cpp-output",          PP_ObjC_Alias, INVALID,        "mi",    "u")
 TYPE("objective-c",              ObjC,         PP_ObjC,         "m",     "u")
Index: lib/Driver/Driver.cpp
===================================================================
--- lib/Driver/Driver.cpp
+++ lib/Driver/Driver.cpp
@@ -538,13 +538,25 @@
   // CUDA
   //
   // We need to generate a CUDA toolchain if any of the inputs has a CUDA type.
+  // ToDo: Handle mixed CUDA/HIP input files and -x hip option. Diagnose
+  //       CUDA on amdgcn and HIP on nvptx.
   if (llvm::any_of(Inputs, [](std::pair<types::ID, const llvm::opt::Arg *> &I) {
         return types::isCuda(I.first);
       })) {
     const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
     const llvm::Triple &HostTriple = HostTC->getTriple();
-    llvm::Triple CudaTriple(HostTriple.isArch64Bit() ? "nvptx64-nvidia-cuda"
-                                                     : "nvptx-nvidia-cuda");
+    StringRef DeviceTripleStr;
+    if (C.getInputArgs().hasArg(options::OPT_x) &&
+        StringRef(C.getInputArgs().getLastArg(options::OPT_x)->getValue()) ==
+            "hip") {
+      // HIP is only supported on amdgcn.
+      DeviceTripleStr = "amdgcn-amd-amdhsa";
+    } else {
+      // CUDA is only supported on nvptx.
+      DeviceTripleStr = HostTriple.isArch64Bit() ? "nvptx64-nvidia-cuda"
+                                                 : "nvptx-nvidia-cuda";
+    }
+    llvm::Triple CudaTriple(DeviceTripleStr);
     // Use the CUDA and host triples as the key into the ToolChains map, because
     // the device toolchain we create depends on both.
     auto &CudaTC = ToolChains[CudaTriple.str() + "/" + HostTriple.str()];
@@ -2249,9 +2261,10 @@
         assert(!GpuArchList.empty() &&
                "We should have at least one GPU architecture.");
 
-        // If the host input is not CUDA, we don't need to bother about this
-        // input.
-        if (IA->getType() != types::TY_CUDA) {
+        // If the host input is not CUDA or HIP, we don't need to bother about
+        // this input.
+        if ((IA->getType() != types::TY_CUDA) &&
+            IA->getType() != types::TY_HIP) {
           // The builder will ignore this input.
           IsActive = false;
           return ABRT_Inactive;
@@ -2264,9 +2277,12 @@
           return ABRT_Success;
 
         // Replicate inputs for each GPU architecture.
-        for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I)
-          CudaDeviceActions.push_back(C.MakeAction<InputAction>(
-              IA->getInputArg(), types::TY_CUDA_DEVICE));
+        auto Ty = IA->getType() == types::TY_HIP ? types::TY_HIP_DEVICE
+                                                 : types::TY_CUDA_DEVICE;
+        for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
+          CudaDeviceActions.push_back(
+              C.MakeAction<InputAction>(IA->getInputArg(), Ty));
+        }
 
         return ABRT_Success;
       }
@@ -2314,7 +2330,8 @@
 
       const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
       assert(HostTC && "No toolchain for host compilation.");
-      if (HostTC->getTriple().isNVPTX()) {
+      if (HostTC->getTriple().isNVPTX() ||
+          HostTC->getTriple().getArch() == llvm::Triple::amdgcn) {
         // We do not support targeting NVPTX for host compilation. Throw
         // an error and abort pipeline construction early so we don't trip
         // asserts that assume device-side compilation.
@@ -3212,6 +3229,9 @@
   bool SaveTemps;
   bool EmbedBitcode;
 
+  /// Type of the input file for the tool
+  types::ID InputType;
+
   /// Get previous dependent action or null if that does not exist. If
   /// \a CanBeCollapsed is false, that action must be legal to collapse or
   /// null will be returned.
@@ -3269,6 +3289,8 @@
   bool canCollapsePreprocessorAction() const {
     return !C.getArgs().hasArg(options::OPT_no_integrated_cpp) &&
            !C.getArgs().hasArg(options::OPT_traditional_cpp) && !SaveTemps &&
+           (InputType != types::TY_LLVM_IR) &&
+           (InputType != types::TY_LLVM_BC) &&
            !C.getArgs().hasArg(options::OPT_rewrite_objc);
   }
 
@@ -3293,6 +3315,11 @@
                                     ActionInfo[I].SavedOffloadAction.end());
   }
 
+  static bool isAMDGPUCUDAOffloading(const Action *A, llvm::Triple T) {
+    return A->isOffloading(Action::OFK_Cuda) &&
+           (StringRef(A->getOffloadingArch()).startswith("gfx") ||
+            T.getArch() == llvm::Triple::amdgcn);
+  }
   /// Functions that attempt to perform the combining. They detect if that is
   /// legal, and if so they update the inputs \a Inputs and the offload action
   /// that were collapsed in \a CollapsedOffloadAction. A tool that deals with
@@ -3314,6 +3341,10 @@
     if (!AJ || !BJ || !CJ)
       return nullptr;
 
+    // Cannot combine compilation with backend for amdgcn backend
+    if (isAMDGPUCUDAOffloading(AJ, TC.getTriple()))
+      return nullptr;
+
     // Get compiler tool.
     const Tool *T = TC.SelectTool(*CJ);
     if (!T)
@@ -3345,6 +3376,10 @@
     if (!AJ || !BJ)
       return nullptr;
 
+    // Cannot combine assemble with backend for amdgcn backend
+    if (isAMDGPUCUDAOffloading(AJ, TC.getTriple()))
+      return nullptr;
+
     // Retrieve the compile job, backend action must always be preceded by one.
     ActionList CompileJobOffloadActions;
     auto *CJ = getPrevDependentAction(BJ->getInputs(), CompileJobOffloadActions,
@@ -3378,6 +3413,16 @@
     if (!BJ || !CJ)
       return nullptr;
 
+    // Cannot combine compilation with backend for amdgcn backend. However
+    // it is necessary to combine when generating IR for compile-only with
+    // flags "-c -S -emit-llvm".  If only flags "-c -S" the gcn backend is
+    // needed to generate linked and opt IR for llc, so do not combine.
+    if (isAMDGPUCUDAOffloading(BJ, TC.getTriple()) &&
+        !(C.getArgs().hasArg(options::OPT_c) &&
+          C.getArgs().hasArg(options::OPT_S) &&
+          C.getArgs().hasArg(options::OPT_emit_llvm)))
+      return nullptr;
+
     // Get compiler tool.
     const Tool *T = TC.SelectTool(*CJ);
     if (!T)
@@ -3421,6 +3466,14 @@
         EmbedBitcode(EmbedBitcode) {
     assert(BaseAction && "Invalid base action.");
     IsHostSelector = BaseAction->getOffloadingDeviceKind() == Action::OFK_None;
+    // Store the InputType to check if Compile and Backend can collapse
+    for (Arg *A : C.getInputArgs()) {
+      if (A->getOption().getKind() == Option::InputClass) {
+        const char *Value = A->getValue();
+        if (const char *Ext = strrchr(Value, '.'))
+          InputType = TC.LookupTypeForExtension(Ext + 1);
+      }
+    }
   }
 
   /// Check if a chain of actions can be combined and return the tool that can
@@ -3849,8 +3902,13 @@
       CCGenDiagnostics) {
     StringRef Name = llvm::sys::path::filename(BaseInput);
     std::pair<StringRef, StringRef> Split = Name.split('.');
+    SmallString<128> fname(Split.first.str().c_str());
+    if (!BoundArch.empty()) {
+      fname += "-";
+      fname.append(BoundArch);
+    }
     std::string TmpName = GetTemporaryPath(
-        Split.first, types::getTypeTempSuffix(JA.getType(), IsCLMode()));
+        fname, types::getTypeTempSuffix(JA.getType(), IsCLMode()));
     return C.addTempFile(C.getArgs().MakeArgString(TmpName));
   }
 
@@ -3921,7 +3979,10 @@
         JA.getType() == types::TY_LLVM_BC)
       Suffixed += ".tmp";
     Suffixed += '.';
-    Suffixed += Suffix;
+    if (((StringRef)BaseInput).endswith(".a"))
+      Suffixed += "a";
+    else
+      Suffixed += Suffix;
     NamedOutput = C.getArgs().MakeArgString(Suffixed.c_str());
   }
 
Index: lib/Driver/SanitizerArgs.cpp
===================================================================
--- lib/Driver/SanitizerArgs.cpp
+++ lib/Driver/SanitizerArgs.cpp
@@ -729,7 +729,8 @@
   // NVPTX doesn't currently support sanitizers.  Bailing out here means that
   // e.g. -fsanitize=address applies only to host code, which is what we want
   // for now.
-  if (TC.getTriple().isNVPTX())
+  if (TC.getTriple().isNVPTX() ||
+      TC.getTriple().getArch() == llvm::Triple::amdgcn)
     return;
 
   // Translate available CoverageFeatures to corresponding clang-cc1 flags.
Index: lib/Driver/ToolChain.cpp
===================================================================
--- lib/Driver/ToolChain.cpp
+++ lib/Driver/ToolChain.cpp
@@ -253,6 +253,10 @@
   return Clang.get();
 }
 
+Tool *ToolChain::buildDeviceLibraryLinker() const {
+  return new tools::Clang(*this);
+}
+
 Tool *ToolChain::buildAssembler() const {
   return new tools::ClangAs(*this);
 }
@@ -267,6 +271,12 @@
   return Assemble.get();
 }
 
+Tool *ToolChain::getDeviceLibraryLink() const {
+  if (!DeviceLibraryLink)
+    DeviceLibraryLink.reset(buildDeviceLibraryLinker());
+  return DeviceLibraryLink.get();
+}
+
 Tool *ToolChain::getClangAs() const {
   if (!Assemble)
     Assemble.reset(new tools::ClangAs(*this));
@@ -307,8 +317,9 @@
   case Action::AnalyzeJobClass:
   case Action::MigrateJobClass:
   case Action::VerifyPCHJobClass:
-  case Action::BackendJobClass:
     return getClang();
+  case Action::BackendJobClass:
+    return getDeviceLibraryLink();
 
   case Action::OffloadBundlingJobClass:
   case Action::OffloadUnbundlingJobClass:
@@ -406,8 +417,21 @@
 }
 
 Tool *ToolChain::SelectTool(const JobAction &JA) const {
-  if (getDriver().ShouldUseClangCompiler(JA)) return getClang();
   Action::ActionClass AC = JA.getKind();
+  // The amdgcn Backend needs buildBackend()
+  // if ( StringRef(JA.getOffloadingArch()).startswith("gfx") &&
+  if (JA.isOffloading(Action::OFK_Cuda) &&
+      (StringRef(JA.getOffloadingArch()).startswith("gfx") ||
+       (getTriple().getArch() == llvm::Triple::amdgcn)) &&
+      (AC == Action::BackendJobClass)) {
+    if ((Args.hasArg(options::OPT_emit_llvm)) ||
+        (Args.hasArg(options::OPT_emit_llvm_bc)))
+      return getClang(); // Dont run amdgcn backend if we just want LLVM IR
+    else
+      return getTool(AC);
+  };
+  if (getDriver().ShouldUseClangCompiler(JA))
+    return getClang();
   if (AC == Action::AssembleJobClass && useIntegratedAs())
     return getClangAs();
   return getTool(AC);
Index: lib/Driver/ToolChains/Clang.cpp
===================================================================
--- lib/Driver/ToolChains/Clang.cpp
+++ lib/Driver/ToolChains/Clang.cpp
@@ -2324,9 +2324,10 @@
                              ArgStringList &CmdArgs, bool KernelOrKext) {
   const llvm::Triple &EffectiveTriple = TC.getEffectiveTriple();
 
-  // NVPTX doesn't support stack protectors; from the compiler's perspective, it
-  // doesn't even have a stack!
-  if (EffectiveTriple.isNVPTX())
+  // NVPTX and GCN don't support stack protectors; from the compiler's
+  // perspective, it doesn't even have a stack!
+  if (EffectiveTriple.isNVPTX() ||
+      EffectiveTriple.getArch() == llvm::Triple::amdgcn)
     return;
 
   // -stack-protector=0 is default.
@@ -3496,7 +3497,8 @@
   // Enable -mconstructor-aliases except on darwin, where we have to work around
   // a linker bug (see <rdar://problem/7651567>), and CUDA device code, where
   // aliases aren't supported.
-  if (!RawTriple.isOSDarwin() && !RawTriple.isNVPTX())
+  if (!RawTriple.isOSDarwin() && !RawTriple.isNVPTX() &&
+      RawTriple.getArch() != llvm::Triple::amdgcn)
     CmdArgs.push_back("-mconstructor-aliases");
 
   // Darwin's kernel doesn't support guard variables; just die if we
Index: lib/Driver/ToolChains/Cuda.h
===================================================================
--- lib/Driver/ToolChains/Cuda.h
+++ lib/Driver/ToolChains/Cuda.h
@@ -127,6 +127,53 @@
 };
 
 } // end namespace NVPTX
+
+namespace AMDGCN {
+// Run llc, the AMDGPU assembler.
+class LLVM_LIBRARY_VISIBILITY Assembler : public Tool {
+public:
+  Assembler(const ToolChain &TC)
+      : Tool("AMDGCN::Assembler", "llc", TC, RF_Full, llvm::sys::WEM_UTF8,
+             "--options-file") {}
+
+  bool hasIntegratedCPP() const override { return false; }
+
+  void ConstructJob(Compilation &C, const JobAction &JA,
+                    const InputInfo &Output, const InputInfoList &Inputs,
+                    const llvm::opt::ArgList &TCArgs,
+                    const char *LinkingOutput) const override;
+};
+
+// Runs clang-offload-bundler, which combines AMDGCN object files into a single
+// output file.
+class LLVM_LIBRARY_VISIBILITY Linker : public Tool {
+public:
+  Linker(const ToolChain &TC)
+      : Tool("AMDGCN::Linker", "clang-offload-bundler", TC, RF_Full,
+             llvm::sys::WEM_UTF8, "--options-file") {}
+
+  bool hasIntegratedCPP() const override { return false; }
+
+  void ConstructJob(Compilation &C, const JobAction &JA,
+                    const InputInfo &Output, const InputInfoList &Inputs,
+                    const llvm::opt::ArgList &TCArgs,
+                    const char *LinkingOutput) const override;
+};
+
+// For amdgcn the device library linker is llvm-link + opt.
+class LLVM_LIBRARY_VISIBILITY DeviceLibraryLinker : public Tool {
+public:
+  DeviceLibraryLinker(const ToolChain &TC)
+      : Tool("AMDGCN::DeviceLibraryLinker", "device-library-linker", TC,
+             RF_Full, llvm::sys::WEM_UTF8, "--options-file") {}
+  virtual bool hasIntegratedCPP() const override { return false; }
+  virtual void ConstructJob(Compilation &C, const JobAction &JA,
+                            const InputInfo &Output,
+                            const InputInfoList &Inputs,
+                            const llvm::opt::ArgList &TCArgs,
+                            const char *LinkingOutput) const override;
+};
+} // end namespace AMDGCN
 } // end namespace tools
 
 namespace toolchains {
@@ -184,6 +231,7 @@
   CudaInstallationDetector CudaInstallation;
 
 protected:
+  Tool *buildDeviceLibraryLinker() const override; // for amdgcn, link and opt
   Tool *buildAssembler() const override;  // ptxas
   Tool *buildLinker() const override;     // fatbinary (ok, not really a linker)
 
Index: lib/Driver/ToolChains/Cuda.cpp
===================================================================
--- lib/Driver/ToolChains/Cuda.cpp
+++ lib/Driver/ToolChains/Cuda.cpp
@@ -222,10 +222,48 @@
     IsValid = true;
     break;
   }
+
+  ArgStringList LibraryPaths;
+  for (auto Arg : Args) {
+    if (Arg->getSpelling() == "-L") {
+      llvm::Twine DevicePath = Twine(Arg->getValue()).concat("/libdevice/");
+      if (D.getVFS().exists(DevicePath)) {
+        LibraryPaths.push_back(Args.MakeArgString(DevicePath));
+      } else {
+        if (D.getVFS().exists(Arg->getValue()))
+          LibraryPaths.push_back(Arg->getValue());
+      }
+    }
+  }
+  LibraryPaths.push_back(Args.MakeArgString(D.Dir + "/../lib/libdevice/"));
+
+  // Search for GCN Device Libraries
+  for (Arg *A : Args) {
+    if (A->getOption().matches(options::OPT_cuda_gpu_arch_EQ) &&
+        StringRef(A->getValue()).startswith("gfx")) {
+      StringRef GFXNAME = A->getValue();
+      for (auto LP : LibraryPaths) {
+        StringRef GCNPath = Args.MakeArgString(LP + GFXNAME);
+        if (D.getVFS().exists(GCNPath)) {
+          LibDeviceMap[GFXNAME.str()] = GCNPath;
+          break;
+        }
+      }
+    }
+  }
 }
 
 void CudaInstallationDetector::AddCudaIncludeArgs(
     const ArgList &DriverArgs, ArgStringList &CC1Args) const {
+
+  if (DriverArgs.hasArg(options::OPT_x) &&
+      (StringRef(DriverArgs.getLastArg(options::OPT_x)->getValue()) ==
+       "hip")) {
+    // HIP needs c++11.
+    CC1Args.push_back("-std=c++11");
+    // Skip CUDA includes for HIP.
+    return;
+  }
   if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) {
     // Add cuda_wrappers/* to our system include path.  This lets us wrap
     // standard library headers.
@@ -273,6 +311,201 @@
        << CudaVersionToString(Version) << "\n";
 }
 
+static bool addBCLib(Compilation &C, const ArgList &Args,
+                     ArgStringList &CmdArgs, ArgStringList LibraryPaths,
+                     const char *BCName) {
+  std::string FullName;
+  bool FoundLibDevice = false;
+  for (std::string LibraryPath : LibraryPaths) {
+    FullName = Args.MakeArgString(LibraryPath + "/" + BCName);
+    if (llvm::sys::fs::exists(FullName.c_str())) {
+      FoundLibDevice = true;
+      break;
+    }
+  }
+  if (!FoundLibDevice)
+    C.getDriver().Diag(diag::err_drv_no_such_file) << BCName;
+  CmdArgs.push_back(Args.MakeArgString(FullName));
+  return FoundLibDevice;
+}
+
+void AMDGCN::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
+                                     const InputInfo &Output,
+                                     const InputInfoList &Inputs,
+                                     const ArgList &Args,
+                                     const char *LinkingOutput) const {
+  const auto &TC =
+      static_cast<const toolchains::CudaToolChain &>(getToolChain());
+  assert(TC.getTriple().getArch() == llvm::Triple::amdgcn && "Wrong platform");
+
+  ArgStringList CmdArgs;
+  for (InputInfoList::const_iterator it = Inputs.begin(), ie = Inputs.end();
+       it != ie; ++it) {
+    const InputInfo &II = *it;
+    CmdArgs.push_back(II.getFilename());
+  }
+  CmdArgs.push_back("-mtriple=amdgcn-amd-amdhsa");
+  CmdArgs.push_back("-filetype=obj");
+  std::string GFXNAME = JA.getOffloadingArch();
+  CmdArgs.push_back(Args.MakeArgString("-mcpu=" + GFXNAME));
+  CmdArgs.push_back("-o");
+  std::string TmpName = C.getDriver().GetTemporaryPath("LC_OUTPUT", "o");
+  const char *llcOutputFile =
+      C.addTempFile(C.getArgs().MakeArgString(TmpName.c_str()));
+  CmdArgs.push_back(llcOutputFile);
+  const char *Exec = Args.MakeArgString(C.getDriver().Dir + "/llc");
+  C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
+
+  ArgStringList CmdArgs2;
+  CmdArgs2.push_back("-flavor");
+  CmdArgs2.push_back("gnu");
+  CmdArgs2.push_back("--no-undefined");
+  CmdArgs2.push_back("-shared");
+  // The output from ld.lld is an HSA code object file
+  CmdArgs2.push_back("-o");
+  CmdArgs2.push_back(Output.getFilename());
+  CmdArgs2.push_back(llcOutputFile);
+  const char *lld = Args.MakeArgString(C.getDriver().Dir + "/lld");
+  C.addCommand(llvm::make_unique<Command>(JA, *this, lld, CmdArgs2, Inputs));
+  return;
+}
+
+// All inputs to this linker must be from CudaDeviceActions, as we need to look
+// at the Inputs' Actions in order to figure out which GPU architecture they
+// correspond to.
+void AMDGCN::Linker::ConstructJob(Compilation &C, const JobAction &JA,
+                                  const InputInfo &Output,
+                                  const InputInfoList &Inputs,
+                                  const ArgList &Args,
+                                  const char *LinkingOutput) const {
+  const auto &TC =
+      static_cast<const toolchains::CudaToolChain &>(getToolChain());
+  assert(TC.getTriple().getArch() == llvm::Triple::amdgcn && "Wrong platform");
+
+  ArgStringList CmdArgs;
+  CmdArgs.push_back(Args.MakeArgString("-type=o"));
+
+  // ToDo: Remove the dummy host binary entry which is required by
+  // clang-offload-bundler.
+  std::string targets = "-targets=host-x86_64-uknown-linux";
+  std::string inputs = "-inputs=/dev/null";
+  for (const auto &II : Inputs) {
+    if (II.getType() != types::TY_PP_Asm) {
+      // ToDo: Teach clang-offload-bundler to recognize hip.
+      targets = targets + ",openmp-amdgcn--amdhsa-" +
+                StringRef(II.getAction()->getOffloadingArch()).str();
+      inputs = inputs + "," + II.getFilename();
+    }
+  }
+  CmdArgs.push_back(Args.MakeArgString(targets));
+  CmdArgs.push_back(Args.MakeArgString(inputs));
+
+  auto outputArgString =
+      Args.MakeArgString(std::string("-outputs=").append(Output.getFilename()));
+  CmdArgs.push_back(outputArgString);
+
+  const char *Exec =
+      Args.MakeArgString(C.getDriver().Dir + "/clang-offload-bundler");
+  C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
+  return;
+}
+
+void AMDGCN::DeviceLibraryLinker::ConstructJob(
+    Compilation &C, const JobAction &JA, const InputInfo &Output,
+    const InputInfoList &Inputs, const ArgList &Args,
+    const char *LinkingOutput) const {
+
+  assert(StringRef(JA.getOffloadingArch()).startswith("gfx") &&
+         " unless gfx processor, backend should be clang");
+
+  // For amdgcn the Backend Job will call llvm-link & opt steps
+  ArgStringList CmdArgs;
+  // Add the input bc's created by compile step
+  for (InputInfoList::const_iterator it = Inputs.begin(), ie = Inputs.end();
+       it != ie; ++it) {
+    const InputInfo &II = *it;
+    CmdArgs.push_back(II.getFilename());
+  }
+
+  std::string GFXNAME = JA.getOffloadingArch();
+
+  ArgStringList LibraryPaths;
+
+  // Find in -L<path> and LIBRARY_PATH.
+  for (auto Arg : Args) {
+    if (Arg->getSpelling() == "-L") {
+      LibraryPaths.push_back(Args.MakeArgString(
+          std::string(Arg->getValue()) + "/libdevice/" + std::string(GFXNAME)));
+      LibraryPaths.push_back(Args.MakeArgString(Arg->getValue()));
+    }
+  }
+
+  // add the compiler installation libdevice last so -L will override them.
+  LibraryPaths.push_back(Args.MakeArgString(
+      C.getDriver().Dir + "/../lib/libdevice/" + std::string(GFXNAME)));
+
+  addDirectoryList(Args, LibraryPaths, "-L", "LIBRARY_PATH");
+
+  addBCLib(C, Args, CmdArgs, LibraryPaths, "libhiprt.bc");
+  addBCLib(C, Args, CmdArgs, LibraryPaths, "opencl.amdgcn.bc");
+  addBCLib(C, Args, CmdArgs, LibraryPaths, "ockl.amdgcn.bc");
+  addBCLib(C, Args, CmdArgs, LibraryPaths, "irif.amdgcn.bc");
+  addBCLib(C, Args, CmdArgs, LibraryPaths, "ocml.amdgcn.bc");
+  addBCLib(C, Args, CmdArgs, LibraryPaths, "oclc_finite_only_off.amdgcn.bc");
+  addBCLib(C, Args, CmdArgs, LibraryPaths, "oclc_daz_opt_off.amdgcn.bc");
+  addBCLib(C, Args, CmdArgs, LibraryPaths,
+           "oclc_correctly_rounded_sqrt_on.amdgcn.bc");
+  addBCLib(C, Args, CmdArgs, LibraryPaths, "oclc_unsafe_math_off.amdgcn.bc");
+  addBCLib(C, Args, CmdArgs, LibraryPaths, "hc.amdgcn.bc");
+  addBCLib(C, Args, CmdArgs, LibraryPaths, "oclc_isa_version.amdgcn.bc");
+
+  CmdArgs.push_back("-suppress-warnings");
+
+  // Add an intermediate output file which is input to opt
+  CmdArgs.push_back("-o");
+  std::string TmpName = C.getDriver().GetTemporaryPath("OPT_INPUT", "bc");
+  const char *ResultingBitcodeF =
+      C.addTempFile(C.getArgs().MakeArgString(TmpName.c_str()));
+  CmdArgs.push_back(ResultingBitcodeF);
+  const char *Exec = Args.MakeArgString(C.getDriver().Dir + "/llvm-link");
+  C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
+
+  ArgStringList OptArgs;
+  // The input to opt is the output from llvm-link.
+  OptArgs.push_back(ResultingBitcodeF);
+  // Pass optimization arg to opt.
+  if (Arg *A = Args.getLastArg(options::OPT_O_Group)) {
+    StringRef OOpt = "3";
+    if (A->getOption().matches(options::OPT_O4) ||
+        A->getOption().matches(options::OPT_Ofast))
+      OOpt = "3";
+    else if (A->getOption().matches(options::OPT_O0))
+      OOpt = "0";
+    else if (A->getOption().matches(options::OPT_O)) {
+      // -Os, -Oz, and -O(anything else) map to -O2
+      OOpt = llvm::StringSwitch<const char *>(A->getValue())
+                 .Case("1", "1")
+                 .Case("2", "2")
+                 .Case("3", "3")
+                 .Case("s", "2")
+                 .Case("z", "2")
+                 .Default("2");
+    }
+    OptArgs.push_back(Args.MakeArgString(llvm::Twine("-O") + OOpt));
+
+    OptArgs.push_back("-S");
+    const char *mcpustr = Args.MakeArgString("-mcpu=" + GFXNAME);
+    OptArgs.push_back(mcpustr);
+    OptArgs.push_back("-dce");
+    OptArgs.push_back("-sroa");
+    OptArgs.push_back("-globaldce");
+  }
+  OptArgs.push_back("-o");
+  OptArgs.push_back(Output.getFilename());
+  const char *OptExec = Args.MakeArgString(C.getDriver().Dir + "/opt");
+  C.addCommand(llvm::make_unique<Command>(JA, *this, OptExec, OptArgs, Inputs));
+}
+
 void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
                                     const InputInfo &Output,
                                     const InputInfoList &Inputs,
@@ -563,10 +796,17 @@
         DriverArgs.hasArg(options::OPT_S))
       return;
 
-    getDriver().Diag(diag::err_drv_no_cuda_libdevice) << GpuArch;
+    if (GpuArch.startswith("gfx"))
+      getDriver().Diag(diag::err_drv_no_hip_libdevice) << GpuArch;
+    else
+      getDriver().Diag(diag::err_drv_no_cuda_libdevice) << GpuArch;
     return;
   }
 
+  // Do not add -link-cuda-bitcode or ptx42 features if gfx
+  if (GpuArch.startswith("gfx"))
+    return;
+
   CC1Args.push_back("-mlink-cuda-bitcode");
   CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile));
 
@@ -709,13 +949,21 @@
   return DAL;
 }
 
+Tool *CudaToolChain::buildDeviceLibraryLinker() const {
+  return new tools::AMDGCN::DeviceLibraryLinker(*this);
+}
+
 Tool *CudaToolChain::buildAssembler() const {
+  if (getTriple().getArch() == llvm::Triple::amdgcn)
+    return new tools::AMDGCN::Assembler(*this);
   return new tools::NVPTX::Assembler(*this);
 }
 
 Tool *CudaToolChain::buildLinker() const {
   if (OK == Action::OFK_OpenMP)
     return new tools::NVPTX::OpenMPLinker(*this);
+  if (getTriple().getArch() == llvm::Triple::amdgcn)
+    return new tools::AMDGCN::Linker(*this);
   return new tools::NVPTX::Linker(*this);
 }
 
Index: lib/Driver/ToolChains/Gnu.cpp
===================================================================
--- lib/Driver/ToolChains/Gnu.cpp
+++ lib/Driver/ToolChains/Gnu.cpp
@@ -471,6 +471,12 @@
   // The profile runtime also needs access to system libraries.
   getToolChain().addProfileRTLibs(Args, CmdArgs);
 
+  // The hip runtime is installed in lib dir of compiler installation.
+  if (Args.hasArg(options::OPT_x) &&
+      StringRef(Args.getLastArg(options::OPT_x)->getValue()) == "hip") {
+    CmdArgs.push_back("-lhip_hcc");
+  }
+
   if (D.CCCIsCXX() &&
       !Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs)) {
     if (ToolChain.ShouldLinkCXXStdlib(Args)) {
Index: lib/Driver/Types.cpp
===================================================================
--- lib/Driver/Types.cpp
+++ lib/Driver/Types.cpp
@@ -102,6 +102,9 @@
   case TY_CL:
   case TY_CUDA: case TY_PP_CUDA:
   case TY_CUDA_DEVICE:
+  case TY_HIP:
+  case TY_PP_HIP:
+  case TY_HIP_DEVICE:
   case TY_ObjC: case TY_PP_ObjC: case TY_PP_ObjC_Alias:
   case TY_CXX: case TY_PP_CXX:
   case TY_ObjCXX: case TY_PP_ObjCXX: case TY_PP_ObjCXX_Alias:
@@ -141,6 +144,9 @@
   case TY_ObjCXXHeader: case TY_PP_ObjCXXHeader:
   case TY_CXXModule: case TY_PP_CXXModule:
   case TY_CUDA: case TY_PP_CUDA: case TY_CUDA_DEVICE:
+  case TY_HIP:
+  case TY_PP_HIP:
+  case TY_HIP_DEVICE:
     return true;
   }
 }
@@ -166,6 +172,9 @@
   case TY_CUDA:
   case TY_PP_CUDA:
   case TY_CUDA_DEVICE:
+  case TY_HIP:
+  case TY_PP_HIP:
+  case TY_HIP_DEVICE:
     return true;
   }
 }
Index: lib/Frontend/CompilerInstance.cpp
===================================================================
--- lib/Frontend/CompilerInstance.cpp
+++ lib/Frontend/CompilerInstance.cpp
@@ -410,7 +410,8 @@
   // triple (the host triple) to initialize our header search, since we need to
   // find the host headers in order to compile the CUDA code.
   const llvm::Triple *HeaderSearchTriple = &PP->getTargetInfo().getTriple();
-  if (PP->getTargetInfo().getTriple().getOS() == llvm::Triple::CUDA &&
+  if ((PP->getTargetInfo().getTriple().getOS() == llvm::Triple::CUDA ||
+       PP->getTargetInfo().getTriple().getOS() == llvm::Triple::AMDHSA) &&
       PP->getAuxTargetInfo())
     HeaderSearchTriple = &PP->getAuxTargetInfo()->getTriple();
 
Index: test/Driver/cuda-phases.cu
===================================================================
--- test/Driver/cuda-phases.cu
+++ test/Driver/cuda-phases.cu
@@ -7,24 +7,29 @@
 // REQUIRES: clang-driver
 // REQUIRES: powerpc-registered-target
 // REQUIRES: nvptx-registered-target
-
+// REQUIRES: amdgpu-registered-target
 //
 // Test single gpu architecture with complete compilation.
 //
 // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s 2>&1 \
-// RUN: | FileCheck -check-prefix=BIN %s
-// BIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
+// RUN: | FileCheck -check-prefixes=BIN,BIN_NV %s
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=BIN,BIN_AMD %s
+// BIN_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-cuda)
+// BIN_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-cuda)
+// BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-cuda)
 // BIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
-// BIN-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, cuda-cpp-output, (device-cuda, sm_30)
-// BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-cuda, sm_30)
-// BIN-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-cuda, sm_30)
-// BIN-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-cuda, sm_30)
-// BIN-DAG: [[P8:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P7]]}, object
-// BIN-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P6]]}, assembler
+// BIN_NV-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-cuda, [[ARCH:sm_30]])
+// BIN_AMD-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-cuda, [[ARCH:gfx803]])
+// BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-cuda, [[ARCH]])
+// BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-cuda, [[ARCH]])
+// BIN-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-cuda, [[ARCH]])
+// BIN-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-cuda, [[ARCH]])
+// BIN_NV-DAG: [[P8:[0-9]+]]: offload, "device-cuda ([[TRIPLE:nvptx64-nvidia-cuda]]:[[ARCH]])" {[[P7]]}, object
+// BIN_AMD-DAG: [[P8:[0-9]+]]: offload, "device-cuda ([[TRIPLE:amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P7]]}, object
+// BIN-DAG: [[P9:[0-9]+]]: offload, "device-cuda ([[TRIPLE]]:[[ARCH]])" {[[P6]]}, assembler
 // BIN-DAG: [[P10:[0-9]+]]: linker, {[[P8]], [[P9]]}, cuda-fatbin, (device-cuda)
-// BIN-DAG: [[P11:[0-9]+]]: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-cuda (nvptx64-nvidia-cuda)" {[[P10]]}, ir
+// BIN-DAG: [[P11:[0-9]+]]: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-cuda ([[TRIPLE]])" {[[P10]]}, ir
 // BIN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-cuda)
 // BIN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-cuda)
 // BIN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-cuda)
@@ -34,13 +39,15 @@
 //
 // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s -S 2>&1 \
 // RUN: | FileCheck -check-prefix=ASM %s
-// ASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// ASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
-// ASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
-// ASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
-// ASM-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler
-// ASM-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// ASM-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (host-cuda)
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 %s -S 2>&1 \
+// RUN: | FileCheck -check-prefix=ASM %s
+// ASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda|hip]], (device-cuda, [[ARCH:sm_30|gfx803]])
+// ASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-cuda, [[ARCH]])
+// ASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, [[ARCH]])
+// ASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, [[ARCH]])
+// ASM-DAG: [[P4:[0-9]+]]: offload, "device-cuda ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P3]]}, assembler
+// ASM-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (host-cuda)
+// ASM-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, [[T]]-cpp-output, (host-cuda)
 // ASM-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (host-cuda)
 // ASM-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (host-cuda)
 
@@ -49,25 +56,27 @@
 //
 // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s 2>&1 \
 // RUN: | FileCheck -check-prefix=BIN2 %s
-// BIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// BIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s 2>&1 \
+// RUN: | FileCheck -check-prefix=BIN2 %s
+// BIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda|hip]], (host-cuda)
+// BIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-cuda)
 // BIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
-// BIN2-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// BIN2-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, cuda-cpp-output, (device-cuda, sm_30)
-// BIN2-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-cuda, sm_30)
-// BIN2-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-cuda, sm_30)
-// BIN2-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-cuda, sm_30)
-// BIN2-DAG: [[P8:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P7]]}, object
-// BIN2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P6]]}, assembler
-// BIN2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35)
-// BIN2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, cuda-cpp-output, (device-cuda, sm_35)
-// BIN2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (device-cuda, sm_35)
-// BIN2-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (device-cuda, sm_35)
-// BIN2-DAG: [[P14:[0-9]+]]: assembler, {[[P13]]}, object, (device-cuda, sm_35)
-// BIN2-DAG: [[P15:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P14]]}, object
-// BIN2-DAG: [[P16:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P13]]}, assembler
+// BIN2-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-cuda, [[ARCH1:sm_30|gfx803]])
+// BIN2-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-cuda, [[ARCH1]])
+// BIN2-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-cuda, [[ARCH1]])
+// BIN2-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-cuda, [[ARCH1]])
+// BIN2-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-cuda, [[ARCH1]])
+// BIN2-DAG: [[P8:[0-9]+]]: offload, "device-cuda ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH1]])" {[[P7]]}, object
+// BIN2-DAG: [[P9:[0-9]+]]: offload, "device-cuda ([[TRIPLE]]:[[ARCH1]])" {[[P6]]}, assembler
+// BIN2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-cuda, [[ARCH2:sm_35|gfx900]])
+// BIN2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, [[T]]-cpp-output, (device-cuda, [[ARCH2]])
+// BIN2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (device-cuda, [[ARCH2]])
+// BIN2-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (device-cuda, [[ARCH2]])
+// BIN2-DAG: [[P14:[0-9]+]]: assembler, {[[P13]]}, object, (device-cuda, [[ARCH2]])
+// BIN2-DAG: [[P15:[0-9]+]]: offload, "device-cuda ([[TRIPLE]]:[[ARCH2]])" {[[P14]]}, object
+// BIN2-DAG: [[P16:[0-9]+]]: offload, "device-cuda ([[TRIPLE]]:[[ARCH2]])" {[[P13]]}, assembler
 // BIN2-DAG: [[P17:[0-9]+]]: linker, {[[P8]], [[P9]], [[P15]], [[P16]]}, cuda-fatbin, (device-cuda)
-// BIN2-DAG: [[P18:[0-9]+]]: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-cuda (nvptx64-nvidia-cuda)" {[[P17]]}, ir
+// BIN2-DAG: [[P18:[0-9]+]]: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-cuda ([[TRIPLE]])" {[[P17]]}, ir
 // BIN2-DAG: [[P19:[0-9]+]]: backend, {[[P18]]}, assembler, (host-cuda)
 // BIN2-DAG: [[P20:[0-9]+]]: assembler, {[[P19]]}, object, (host-cuda)
 // BIN2-DAG: [[P21:[0-9]+]]: linker, {[[P20]]}, image, (host-cuda)
@@ -77,18 +86,20 @@
 //
 // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s -S 2>&1 \
 // RUN: | FileCheck -check-prefix=ASM2 %s
-// ASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// ASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
-// ASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
-// ASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
-// ASM2-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler
-// ASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35)
-// ASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (device-cuda, sm_35)
-// ASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-cuda, sm_35)
-// ASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-cuda, sm_35)
-// ASM2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P8]]}, assembler
-// ASM2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// ASM2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, cuda-cpp-output, (host-cuda)
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s -S 2>&1 \
+// RUN: | FileCheck -check-prefix=ASM2 %s
+// ASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda|hip]], (device-cuda, [[ARCH1:sm_30|gfx803]])
+// ASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-cuda, [[ARCH1]])
+// ASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, [[ARCH1]])
+// ASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, [[ARCH1]])
+// ASM2-DAG: [[P4:[0-9]+]]: offload, "device-cuda ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH1]])" {[[P3]]}, assembler
+// ASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-cuda, [[ARCH2:sm_35|gfx900]])
+// ASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, [[T]]-cpp-output, (device-cuda, [[ARCH2]])
+// ASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-cuda, [[ARCH2]])
+// ASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-cuda, [[ARCH2]])
+// ASM2-DAG: [[P9:[0-9]+]]: offload, "device-cuda ([[TRIPLE]]:[[ARCH2]])" {[[P8]]}, assembler
+// ASM2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (host-cuda)
+// ASM2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, [[T]]-cpp-output, (host-cuda)
 // ASM2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (host-cuda)
 // ASM2-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (host-cuda)
 
@@ -98,8 +109,10 @@
 //
 // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-host-only 2>&1 \
 // RUN: | FileCheck -check-prefix=HBIN %s
-// HBIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// HBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 %s --cuda-host-only 2>&1 \
+// RUN: | FileCheck -check-prefix=HBIN %s
+// HBIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda|hip]], (host-cuda)
+// HBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-cuda)
 // HBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
 // HBIN-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda)
 // HBIN-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (host-cuda)
@@ -110,8 +123,10 @@
 //
 // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-host-only -S 2>&1 \
 // RUN: | FileCheck -check-prefix=HASM %s
-// HASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// HASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 %s --cuda-host-only -S 2>&1 \
+// RUN: | FileCheck -check-prefix=HASM %s
+// HASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda|hip]], (host-cuda)
+// HASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-cuda)
 // HASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
 // HASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda)
 
@@ -121,8 +136,10 @@
 //
 // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only 2>&1 \
 // RUN: | FileCheck -check-prefix=HBIN2 %s
-// HBIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// HBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --cuda-host-only 2>&1 \
+// RUN: | FileCheck -check-prefix=HBIN2 %s
+// HBIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda|hip]], (host-cuda)
+// HBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-cuda)
 // HBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
 // HBIN2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda)
 // HBIN2-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (host-cuda)
@@ -134,8 +151,10 @@
 //
 // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only -S 2>&1 \
 // RUN: | FileCheck -check-prefix=HASM2 %s
-// HASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// HASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --cuda-host-only -S 2>&1 \
+// RUN: | FileCheck -check-prefix=HASM2 %s
+// HASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda|hip]], (host-cuda)
+// HASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-cuda)
 // HASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
 // HASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda)
 
@@ -145,12 +164,14 @@
 //
 // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-device-only 2>&1 \
 // RUN: | FileCheck -check-prefix=DBIN %s
-// DBIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// DBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
-// DBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
-// DBIN-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
-// DBIN-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-cuda, sm_30)
-// DBIN-DAG: [[P5:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P4]]}, object
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 %s --cuda-device-only 2>&1 \
+// RUN: | FileCheck -check-prefix=DBIN %s
+// DBIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda|hip]], (device-cuda, [[ARCH:sm_30|gfx803]])
+// DBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-cuda, [[ARCH]])
+// DBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, [[ARCH]])
+// DBIN-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, [[ARCH]])
+// DBIN-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-cuda, [[ARCH]])
+// DBIN-DAG: [[P5:[0-9]+]]: offload, "device-cuda ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P4]]}, object
 
 //
 // Test single gpu architecture up to the assemble phase in device-only
@@ -158,11 +179,13 @@
 //
 // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-device-only -S 2>&1 \
 // RUN: | FileCheck -check-prefix=DASM %s
-// DASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// DASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
-// DASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
-// DASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
-// DASM-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 %s --cuda-device-only -S 2>&1 \
+// RUN: | FileCheck -check-prefix=DASM %s
+// DASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda|hip]], (device-cuda, [[ARCH:sm_30|gfx803]])
+// DASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-cuda, [[ARCH]])
+// DASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, [[ARCH]])
+// DASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, [[ARCH]])
+// DASM-DAG: [[P4:[0-9]+]]: offload, "device-cuda ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P3]]}, assembler
 
 //
 // Test two gpu architectures with complete compilation in device-only
@@ -170,18 +193,20 @@
 //
 // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only 2>&1 \
 // RUN: | FileCheck -check-prefix=DBIN2 %s
-// DBIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// DBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
-// DBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
-// DBIN2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
-// DBIN2-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-cuda, sm_30)
-// DBIN2-DAG: [[P5:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P4]]}, object
-// DBIN2-DAG: [[P6:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35)
-// DBIN2-DAG: [[P7:[0-9]+]]: preprocessor, {[[P6]]}, cuda-cpp-output, (device-cuda, sm_35)
-// DBIN2-DAG: [[P8:[0-9]+]]: compiler, {[[P7]]}, ir, (device-cuda, sm_35)
-// DBIN2-DAG: [[P9:[0-9]+]]: backend, {[[P8]]}, assembler, (device-cuda, sm_35)
-// DBIN2-DAG: [[P10:[0-9]+]]: assembler, {[[P9]]}, object, (device-cuda, sm_35)
-// DBIN2-DAG: [[P11:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P10]]}, object
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --cuda-device-only 2>&1 \
+// RUN: | FileCheck -check-prefix=DBIN2 %s
+// DBIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda|hip]], (device-cuda, [[ARCH:sm_30|gfx803]])
+// DBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-cuda, [[ARCH]])
+// DBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, [[ARCH]])
+// DBIN2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, [[ARCH]])
+// DBIN2-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-cuda, [[ARCH]])
+// DBIN2-DAG: [[P5:[0-9]+]]: offload, "device-cuda ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P4]]}, object
+// DBIN2-DAG: [[P6:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-cuda, [[ARCH2:sm_35|gfx900]])
+// DBIN2-DAG: [[P7:[0-9]+]]: preprocessor, {[[P6]]}, [[T]]-cpp-output, (device-cuda, [[ARCH2]])
+// DBIN2-DAG: [[P8:[0-9]+]]: compiler, {[[P7]]}, ir, (device-cuda, [[ARCH2]])
+// DBIN2-DAG: [[P9:[0-9]+]]: backend, {[[P8]]}, assembler, (device-cuda, [[ARCH2]])
+// DBIN2-DAG: [[P10:[0-9]+]]: assembler, {[[P9]]}, object, (device-cuda, [[ARCH2]])
+// DBIN2-DAG: [[P11:[0-9]+]]: offload, "device-cuda ([[TRIPLE]]:[[ARCH2]])" {[[P10]]}, object
 
 //
 // Test two gpu architectures up to the assemble phase in device-only
@@ -189,13 +214,15 @@
 //
 // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only -S 2>&1 \
 // RUN: | FileCheck -check-prefix=DASM2 %s
-// DASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// DASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
-// DASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
-// DASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
-// DASM2-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler
-// DASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35)
-// DASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (device-cuda, sm_35)
-// DASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-cuda, sm_35)
-// DASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-cuda, sm_35)
-// DASM2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P8]]}, assembler
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --cuda-device-only -S 2>&1 \
+// RUN: | FileCheck -check-prefix=DASM2 %s
+// DASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda|hip]], (device-cuda, [[ARCH:sm_30|gfx803]])
+// DASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-cuda, [[ARCH]])
+// DASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, [[ARCH]])
+// DASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, [[ARCH]])
+// DASM2-DAG: [[P4:[0-9]+]]: offload, "device-cuda ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P3]]}, assembler
+// DASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-cuda, [[ARCH2:sm_35|gfx900]])
+// DASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, [[T]]-cpp-output, (device-cuda, [[ARCH2]])
+// DASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-cuda, [[ARCH2]])
+// DASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-cuda, [[ARCH2]])
+// DASM2-DAG: [[P9:[0-9]+]]: offload, "device-cuda ([[TRIPLE]]:[[ARCH2]])" {[[P8]]}, assembler