diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
--- a/clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -121,6 +121,9 @@
   "invalid argument in '%0', only integer or 'auto' is supported">;
 def err_drv_missing_argument : Error<
   "argument to '%0' is missing (expected %1 value%s1)">;
+def err_drv_missing_Xopenmptarget_or_march : Error<
+  "'-fopenmp-targets=' requires additional options '-Xopenmp-target=' and '-march='">,
+  DefaultFatal;
 def err_drv_invalid_Xarch_argument_with_args : Error<
   "invalid Xarch argument: '%0', options requiring arguments are unsupported">;
 def err_drv_Xopenmp_target_missing_triple : Error<
diff --git a/clang/include/clang/Driver/ToolChain.h b/clang/include/clang/Driver/ToolChain.h
--- a/clang/include/clang/Driver/ToolChain.h
+++ b/clang/include/clang/Driver/ToolChain.h
@@ -170,6 +170,9 @@
   mutable llvm::Optional<RuntimeLibType> runtimeLibType;
   mutable llvm::Optional<UnwindLibType> unwindLibType;
 
+  // OpenMP creates one toolchain per offload arch, e.g. gfx908.
+  std::string OffloadArch;
+
 protected:
   MultilibSet Multilibs;
   Multilib SelectedMultilib;
@@ -246,6 +249,12 @@
     return EffectiveTriple;
   }
 
+  /// Returned by reference so StringRef views of the result do not dangle.
+  const std::string &getOffloadArch() const { return OffloadArch; }
+  void setOffloadArch(std::string OffloadArch) {
+    this->OffloadArch = std::move(OffloadArch);
+  }
+
   path_list &getLibraryPaths() { return LibraryPaths; }
   const path_list &getLibraryPaths() const { return LibraryPaths; }
 
diff --git a/clang/lib/Driver/Action.cpp b/clang/lib/Driver/Action.cpp
--- a/clang/lib/Driver/Action.cpp
+++ b/clang/lib/Driver/Action.cpp
@@ -206,11 +206,23 @@
                              const DeviceDependences &DDeps)
     : Action(OffloadClass, HDep.getAction()), HostTC(HDep.getToolChain()),
       DevToolChains(DDeps.getToolChains()) {
-  // We use the kinds of the host dependence for this action.
-  OffloadingArch = HDep.getBoundArch();
+  auto &OKinds = DDeps.getOffloadKinds();
+  auto &BArchs = DDeps.getBoundArchs();
+
+  // If all inputs agree on the same kind, use it also for this action.
+  if (llvm::all_of(OKinds, [&](OffloadKind K) { return K == OKinds.front(); }))
+    OffloadingDeviceKind = OKinds.front();
+
+  // If we have a single dependency, inherit the architecture from it.
+  if (OKinds.size() == 1)
+    OffloadingArch = BArchs.front();
+  else
+    // We use the kinds of the host dependence for this action.
+    OffloadingArch = HDep.getBoundArch();
+
   ActiveOffloadKindMask = HDep.getOffloadKinds();
   HDep.getAction()->propagateHostOffloadInfo(HDep.getOffloadKinds(),
-                                             HDep.getBoundArch());
+                                             OffloadingArch);
 
   // Add device inputs and propagate info to the device actions. Do work only if
   // we have dependencies.
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -678,6 +678,38 @@
   return RT;
 }
 
+/// Collects one "<normalized-triple>^<arch>" key into \p OffloadArchs for each
+/// -march=<arch> value forwarded through -Xopenmp-target=<triple>. The '^'
+/// separator is what CreateOffloadingDeviceToolChains later splits the key on.
+///
+/// \returns false after diagnosing an unknown GPU arch and flagging \p C as
+/// containing an error; true otherwise.
+static bool GetTargetInfoFromMArch(Compilation &C,
+                                   std::set<std::string> &OffloadArchs) {
+  for (Arg *A : C.getInputArgs()) {
+    if (!A->getOption().matches(options::OPT_Xopenmp_target_EQ))
+      continue;
+    A->claim();
+    for (StringRef Val : A->getValues()) {
+      if (!Val.startswith("-march=") && !Val.startswith("--march="))
+        continue;
+      StringRef ArchName = Val.split('=').second;
+      if (StringToCudaArch(ArchName) == CudaArch::UNKNOWN) {
+        C.getDriver().Diag(clang::diag::err_drv_cuda_bad_gpu_arch) << ArchName;
+        C.setContainsError();
+        return false;
+      }
+      // Value 0 is the triple the option was joined with. Triple and arch
+      // together form a unique key for each instance of the ToolChain.
+      StringRef TripleStr = A->getValue(0);
+      if (!TripleStr.empty())
+        OffloadArchs.insert(llvm::Triple(TripleStr).normalize() + "^" +
+                            ArchName.str());
+    }
+  }
+  return true;
+}
+
 void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
                                               InputList &Inputs) {
 
@@ -729,17 +761,58 @@
           *this, HIPTriple, *HostTC, C.getInputArgs());
     }
     C.addOffloadDeviceToolChain(HIPTC.get(), OFK);
-  }
+  } else {
+    //
+    // OpenMP
+    //
 
-  //
-  // OpenMP
-  //
-  // We need to generate an OpenMP toolchain if the user specified targets with
-  // the -fopenmp-targets option.
-  if (Arg *OpenMPTargets =
-          C.getInputArgs().getLastArg(options::OPT_fopenmp_targets_EQ)) {
-    if (OpenMPTargets->getNumValues()) {
-      // We expect that -fopenmp-targets is always used in conjunction with the
+    std::set<std::string> OffloadArchs;
+
+    if (Arg *OpenMPTargets =
+            C.getInputArgs().getLastArg(options::OPT_fopenmp_targets_EQ)) {
+
+      if (!OpenMPTargets->getNumValues()) {
+        Diag(clang::diag::warn_drv_empty_joined_argument)
+            << OpenMPTargets->getAsString(C.getInputArgs());
+        return;
+      }
+
+      // First, handle errors in command line for OpenMP target offload
+      bool is_host_offloading =
+          (OpenMPTargets->getNumValues() == 1) &&
+          StringRef(OpenMPTargets->getValue())
+              .startswith_insensitive(
+                  C.getSingleOffloadToolChain<Action::OFK_Host>()
+                      ->getTriple()
+                      .getArchName());
+      if (!is_host_offloading) {
+        // Ensure at least one -Xopenmp-target exists with a GPU -march. Every
+        // occurrence is inspected, not only the last one, because the option
+        // may be repeated and -march need not be on the final occurrence.
+        bool has_valid_march = false;
+        for (const Arg *XOpenMPTarget :
+             C.getInputArgs().filtered(options::OPT_Xopenmp_target_EQ)) {
+          for (StringRef V : XOpenMPTarget->getValues())
+            if (V.startswith("-march=") || V.startswith("--march="))
+              has_valid_march = true;
+        }
+        // This also fires when no -Xopenmp-target was passed at all, as the
+        // flag is then never set.
+        if (!has_valid_march) {
+          Diag(diag::err_drv_missing_Xopenmptarget_or_march);
+          return;
+        }
+      }
+
+      // Process the legacy options -fopenmp-targets, -Xopenmp-target and -march.
+      auto status = GetTargetInfoFromMArch(C, OffloadArchs);
+      if (!status)
+        return;
+    }
+
+    if (!OffloadArchs.empty()) {
+
+      // We expect that an offload target is always used in conjunction with
       // option -fopenmp specifying a valid runtime with offloading support,
       // i.e. libomp or libiomp.
       bool HasValidOpenMPRuntime = C.getInputArgs().hasFlag(
@@ -750,61 +823,65 @@
         HasValidOpenMPRuntime =
             OpenMPKind == OMPRT_OMP || OpenMPKind == OMPRT_IOMP5;
       }
+      if (!HasValidOpenMPRuntime) {
+        Diag(clang::diag::err_drv_expecting_fopenmp_with_fopenmp_targets);
+        return;
+      }
 
-      if (HasValidOpenMPRuntime) {
-        llvm::StringMap<const char *> FoundNormalizedTriples;
-        for (const char *Val : OpenMPTargets->getValues()) {
-          llvm::Triple TT(Val);
-          std::string NormalizedName = TT.normalize();
-
-          // Make sure we don't have a duplicate triple.
-          auto Duplicate = FoundNormalizedTriples.find(NormalizedName);
-          if (Duplicate != FoundNormalizedTriples.end()) {
-            Diag(clang::diag::warn_drv_omp_offload_target_duplicate)
-                << Val << Duplicate->second;
-            continue;
-          }
+      llvm::StringMap<const char *> FoundNormalizedTriples;
+      for (auto &Target : OffloadArchs) {
+        size_t Loc = Target.find('^');
+        std::string TripleStr = Target.substr(0, Loc);
+        std::string OpenMPTargetArch = Target.substr(Loc + 1);
+        llvm::Triple TT(TripleStr);
+        std::string NormalizedName = Target;
+
+        // Make sure we don't have a duplicate triple.
+        auto Duplicate = FoundNormalizedTriples.find(NormalizedName);
+        if (Duplicate != FoundNormalizedTriples.end()) {
+          Diag(clang::diag::warn_drv_omp_offload_target_duplicate)
+              << NormalizedName << Duplicate->second;
+          continue;
+        }
+
+        // Remember this key for the duplicate check in later iterations. The
+        // stored pointer must outlive this iteration, so use the set element.
+        FoundNormalizedTriples[NormalizedName] = Target.c_str();
 
-          // Store the current triple so that we can check for duplicates in the
-          // following iterations.
-          FoundNormalizedTriples[NormalizedName] = Val;
-
-          // If the specified target is invalid, emit a diagnostic.
-          if (TT.getArch() == llvm::Triple::UnknownArch)
-            Diag(clang::diag::err_drv_invalid_omp_target) << Val;
-          else {
-            const ToolChain *TC;
-            // Device toolchains have to be selected differently. They pair host
-            // and device in their implementation.
-            if (TT.isNVPTX() || TT.isAMDGCN()) {
-              const ToolChain *HostTC =
-                  C.getSingleOffloadToolChain<Action::OFK_Host>();
-              assert(HostTC && "Host toolchain should be always defined.");
-              auto &DeviceTC =
-                  ToolChains[TT.str() + "/" + HostTC->getTriple().normalize()];
-              if (!DeviceTC) {
-                if (TT.isNVPTX())
-                  DeviceTC = std::make_unique<toolchains::CudaToolChain>(
-                      *this, TT, *HostTC, C.getInputArgs(), Action::OFK_OpenMP);
-                else if (TT.isAMDGCN())
-                  DeviceTC =
-                      std::make_unique<toolchains::AMDGPUOpenMPToolChain>(
-                          *this, TT, *HostTC, C.getInputArgs());
-                else
-                  assert(DeviceTC && "Device toolchain not defined.");
-              }
-
-              TC = DeviceTC.get();
-            } else
-              TC = &getToolChain(C.getInputArgs(), TT);
-            C.addOffloadDeviceToolChain(TC, Action::OFK_OpenMP);
+        // If the specified target is invalid, emit a diagnostic.
+        if (TT.getArch() == llvm::Triple::UnknownArch) {
+          Diag(clang::diag::err_drv_invalid_omp_target) << NormalizedName;
+          return;
+        }
+
+        const ToolChain *TC;
+        // Device toolchains have to be selected differently. They pair host
+        // and device in their implementation.
+        if (TT.isNVPTX() || TT.isAMDGCN()) {
+          const ToolChain *HostTC =
+              C.getSingleOffloadToolChain<Action::OFK_Host>();
+          assert(HostTC && "Host toolchain should be always defined.");
+          auto &DeviceTC = ToolChains[NormalizedName + "/" +
+                                      HostTC->getTriple().normalize()];
+          if (!DeviceTC) {
+            if (TT.isNVPTX())
+              DeviceTC = std::make_unique<toolchains::CudaToolChain>(
+                  *this, TT, *HostTC, C.getInputArgs(), Action::OFK_OpenMP,
+                  OpenMPTargetArch);
+            else if (TT.isAMDGCN())
+              DeviceTC = std::make_unique<toolchains::AMDGPUOpenMPToolChain>(
+                  *this, TT, *HostTC, C.getInputArgs(), OpenMPTargetArch);
+            else
+              assert(DeviceTC && "Device toolchain not defined.");
           }
+          TC = DeviceTC.get();
+        } else {
+          TC = &getToolChain(C.getInputArgs(), TT);
         }
-      } else
-        Diag(clang::diag::err_drv_expecting_fopenmp_with_fopenmp_targets);
-    } else
-      Diag(clang::diag::warn_drv_empty_joined_argument)
-          << OpenMPTargets->getAsString(C.getInputArgs());
+        // Each value of -fopenmp-targets gets its own offload toolchain instance.
+        C.addOffloadDeviceToolChain(TC, Action::OFK_OpenMP);
+      } // end foreach openmp target
+    }   // end has openmp offload targets
   }
 
   //
@@ -2406,6 +2483,19 @@
       ABRT_Ignore_Host,
     };
 
+    /// ID to identify each device compilation. For CUDA it is simply the
+    /// GPU arch string. For HIP it is either the GPU arch string or GPU
+    /// arch string plus feature strings delimited by a plus sign, e.g.
+    /// gfx906+xnack.
+    struct TargetID {
+      /// Target ID string which is persistent throughout the compilation.
+      const char *ID;
+      TargetID(CudaArch Arch) : ID(CudaArchToString(Arch)) {}
+      TargetID(const char *ID) : ID(ID) {}
+      operator const char *() const { return ID; }
+      operator StringRef() const { return StringRef(ID); }
+    };
+
   protected:
     /// Compilation associated with this builder.
     Compilation &C;
@@ -2487,18 +2577,6 @@
     bool EmitLLVM = false;
     bool EmitAsm = false;
 
-    /// ID to identify each device compilation. For CUDA it is simply the
-    /// GPU arch string. For HIP it is either the GPU arch string or GPU
-    /// arch string plus feature strings delimited by a plus sign, e.g.
-    /// gfx906+xnack.
-    struct TargetID {
-      /// Target ID string which is persistent throughout the compilation.
-      const char *ID;
-      TargetID(CudaArch Arch) { ID = CudaArchToString(Arch); }
-      TargetID(const char *ID) : ID(ID) {}
-      operator const char *() { return ID; }
-      operator StringRef() { return StringRef(ID); }
-    };
     /// List of GPU architectures to use in this compilation.
     SmallVector<TargetID, 4> GpuArchList;
 
@@ -3121,6 +3199,12 @@
     /// The OpenMP actions for the current input.
     ActionList OpenMPDeviceActions;
 
+    bool CompileHostOnly = false;
+    bool CompileDeviceOnly = false;
+
+    /// List of GPU architectures to use in this compilation.
+    SmallVector<TargetID, 4> GpuArchList;
+
     /// The linker inputs obtained for each toolchain.
     SmallVector<ActionList, 8> DeviceLinkerInputs;
 
@@ -3154,14 +3238,26 @@
         // We passed the device action as a host dependence, so we don't need to
         // do anything else with them.
         OpenMPDeviceActions.clear();
-        return ABRT_Success;
+        // In device-only mode, report that the host action is to be ignored.
+        return CompileDeviceOnly ? ABRT_Ignore_Host : ABRT_Success;
       }
 
+      bool LastActionIsCompile = false;
       // By default, we produce an action for each device arch.
-      for (Action *&A : OpenMPDeviceActions)
-        A = C.getDriver().ConstructPhaseAction(C, Args, CurPhase, A);
-
-      return ABRT_Success;
+      for (unsigned I = 0; I < ToolChains.size(); ++I) {
+        Action *&A = OpenMPDeviceActions[I];
+        // AMDGPU does not support linking of object files, so we skip
+        // assemble and backend actions to produce LLVM IR.
+        if (ToolChains[I]->getTriple().isAMDGCN() &&
+            (CurPhase == phases::Assemble || CurPhase == phases::Backend))
+          continue;
+        A = C.getDriver().ConstructPhaseAction(C, Args, CurPhase, A,
+                                               Action::OFK_OpenMP);
+        LastActionIsCompile =
+            (A->getKind() == Action::ActionClass::CompileJobClass);
+      }
+      return (CompileDeviceOnly && LastActionIsCompile) ? ABRT_Ignore_Host
+                                                        : ABRT_Success;
     }
 
     ActionBuilderReturnCode addDeviceDepences(Action *HostAction) override {
@@ -3169,9 +3265,15 @@
       // If this is an input action replicate it for each OpenMP toolchain.
       if (auto *IA = dyn_cast<InputAction>(HostAction)) {
         OpenMPDeviceActions.clear();
-        for (unsigned I = 0; I < ToolChains.size(); ++I)
-          OpenMPDeviceActions.push_back(
-              C.MakeAction<InputAction>(IA->getInputArg(), IA->getType()));
+        // Only process input actions for files that have extensions
+        std::string FileName = IA->getInputArg().getAsString(Args);
+        if (!llvm::sys::path::has_extension(FileName)) {
+          return ABRT_Inactive;
+        }
+        for (unsigned I = 0; I < ToolChains.size(); ++I) {
+          OpenMPDeviceActions.push_back(C.MakeAction<InputAction>(
+              IA->getInputArg(), IA->getType(), GpuArchList[I].ID));
+        }
         return ABRT_Success;
       }
 
@@ -3191,8 +3293,9 @@
           return ABRT_Inactive;
         for (unsigned I = 0; I < ToolChains.size(); ++I) {
           OpenMPDeviceActions.push_back(UA);
-          UA->registerDependentActionInfo(
-              ToolChains[I], /*BoundArch=*/StringRef(), Action::OFK_OpenMP);
+          UA->registerDependentActionInfo(ToolChains[I],
+                                          /*BoundArch=*/GpuArchList[I].ID,
+                                          Action::OFK_OpenMP);
         }
         return ABRT_Success;
       }
@@ -3209,10 +3312,11 @@
             *HostAction, *C.getSingleOffloadToolChain<Action::OFK_Host>(),
             /*BoundArch=*/nullptr, Action::OFK_OpenMP);
         auto TC = ToolChains.begin();
+        unsigned arch_count = 0;
         for (Action *&A : OpenMPDeviceActions) {
           assert(isa<CompileJobAction>(A));
           OffloadAction::DeviceDependences DDep;
-          DDep.add(*A, **TC, /*BoundArch=*/nullptr, Action::OFK_OpenMP);
+          DDep.add(*A, **TC, GpuArchList[arch_count++].ID, Action::OFK_OpenMP);
           A = C.MakeAction<OffloadAction>(HDep, DDep);
           ++TC;
         }
@@ -3228,11 +3332,13 @@
       assert(OpenMPDeviceActions.size() == ToolChains.size() &&
              "Number of OpenMP actions and toolchains do not match.");
 
+      unsigned arch_count = 0;
       // Append all device actions followed by the proper offload action.
       auto TI = ToolChains.begin();
       for (auto *A : OpenMPDeviceActions) {
         OffloadAction::DeviceDependences Dep;
-        Dep.add(*A, **TI, /*BoundArch=*/nullptr, Action::OFK_OpenMP);
+        Dep.add(*A, **TI, /*BoundArch=*/GpuArchList[arch_count++].ID,
+                Action::OFK_OpenMP);
         AL.push_back(C.MakeAction<OffloadAction>(Dep, A->getType()));
         ++TI;
       }
@@ -3243,17 +3349,17 @@
     void appendLinkDeviceActions(ActionList &AL) override {
       assert(ToolChains.size() == DeviceLinkerInputs.size() &&
              "Toolchains and linker inputs sizes do not match.");
-
       // Append a new link action for each device.
       auto TC = ToolChains.begin();
+      unsigned arch_count = 0;
       for (auto &LI : DeviceLinkerInputs) {
         auto *DeviceLinkAction =
             C.MakeAction<LinkJobAction>(LI, types::TY_Image);
         OffloadAction::DeviceDependences DeviceLinkDeps;
-        DeviceLinkDeps.add(*DeviceLinkAction, **TC, /*BoundArch=*/nullptr,
-		        Action::OFK_OpenMP);
+        DeviceLinkDeps.add(*DeviceLinkAction, **TC,
+                           GpuArchList[arch_count++].ID, Action::OFK_OpenMP);
         AL.push_back(C.MakeAction<OffloadAction>(DeviceLinkDeps,
-            DeviceLinkAction->getType()));
+                                                 DeviceLinkAction->getType()));
         ++TC;
       }
       DeviceLinkerInputs.clear();
@@ -3270,12 +3376,21 @@
     void appendLinkDependences(OffloadAction::DeviceDependences &DA) override {}
 
     bool initialize() override {
+      if (Arg *cu_dev_only =
+              C.getInputArgs().getLastArg(options::OPT_cuda_device_only)) {
+        cu_dev_only->claim();
+        CompileDeviceOnly = true;
+        // TODO: Check emitting IR for OpenMP when cuda-device-only is set
+      }
       // Get the OpenMP toolchains. If we don't get any, the action builder will
       // know there is nothing to do related to OpenMP offloading.
       auto OpenMPTCRange = C.getOffloadToolChains<Action::OFK_OpenMP>();
       for (auto TI = OpenMPTCRange.first, TE = OpenMPTCRange.second; TI != TE;
-           ++TI)
+           ++TI) {
+        GpuArchList.push_back(
+            TI->second->getTriple().getEnvironmentName().data());
         ToolChains.push_back(TI->second);
+      }
 
       DeviceLinkerInputs.resize(ToolChains.size());
       return false;
@@ -4593,6 +4708,7 @@
     OA->doOnEachDependence(
         /*IsHostDependence=*/BuildingForOffloadDevice,
         [&](Action *DepA, const ToolChain *DepTC, const char *DepBoundArch) {
+
           OffloadDependencesInputInfo.push_back(BuildJobsForAction(
               C, DepA, DepTC, DepBoundArch, /*AtTopLevel=*/false,
               /*MultipleArchs*/ !!DepBoundArch, LinkingOutput, CachedResults,
@@ -4645,25 +4761,6 @@
   if (!T)
     return InputInfo();
 
-  if (BuildingForOffloadDevice &&
-      A->getOffloadingDeviceKind() == Action::OFK_OpenMP) {
-    if (TC->getTriple().isAMDGCN()) {
-      // AMDGCN treats backend and assemble actions as no-op because
-      // linker does not support object files.
-      if (const BackendJobAction *BA = dyn_cast<BackendJobAction>(A)) {
-        return BuildJobsForAction(C, *BA->input_begin(), TC, BoundArch,
-                                  AtTopLevel, MultipleArchs, LinkingOutput,
-                                  CachedResults, TargetDeviceOffloadKind);
-      }
-
-      if (const AssembleJobAction *AA = dyn_cast<AssembleJobAction>(A)) {
-        return BuildJobsForAction(C, *AA->input_begin(), TC, BoundArch,
-                                  AtTopLevel, MultipleArchs, LinkingOutput,
-                                  CachedResults, TargetDeviceOffloadKind);
-      }
-    }
-  }
-
   // If we've collapsed action list that contained OffloadAction we
   // need to build jobs for host/device-side inputs it may have held.
   for (const auto *OA : CollapsedOffloadActions)
@@ -4747,17 +4844,23 @@
                                  UI.DependentOffloadKind == Action::OFK_HIP,
                              OffloadingPrefix),
           BaseInput);
+      if (UI.DependentOffloadKind == Action::OFK_Host &&
+          llvm::sys::path::extension(InputInfos[0].getFilename()) == ".a")
+        CurI = InputInfos[0];
       // Save the unbundling result.
       UnbundlingResults.push_back(CurI);
 
       // Get the unique string identifier for this dependence and cache the
       // result.
       StringRef Arch;
-      if (TargetDeviceOffloadKind == Action::OFK_HIP) {
+      if (TargetDeviceOffloadKind == Action::OFK_HIP ||
+          TargetDeviceOffloadKind == Action::OFK_OpenMP) {
         if (UI.DependentOffloadKind == Action::OFK_Host)
           Arch = StringRef();
-        else
+        else if (TargetDeviceOffloadKind == Action::OFK_HIP)
           Arch = UI.DependentBoundArch;
+        else if (TargetDeviceOffloadKind == Action::OFK_OpenMP)
+          Arch = UI.DependentToolChain->getOffloadArch();
       } else
         Arch = BoundArch;
 
@@ -4787,8 +4890,9 @@
         BaseInput = FinalOutput->getValue();
       else
         BaseInput = getDefaultImageName();
-      BaseInput =
-          C.getArgs().MakeArgString(std::string(BaseInput) + "-wrapper");
+      std::string BaseNm = std::string(BaseInput);
+      std::replace(BaseNm.begin(), BaseNm.end(), '.', '_');
+      BaseInput = C.getArgs().MakeArgString(BaseNm + "-wrapper");
     }
     Result = InputInfo(A, GetNamedOutputPath(C, *JA, BaseInput, BoundArch,
                                              AtTopLevel, MultipleArchs,
diff --git a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.h b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.h
--- a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.h
+++ b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.h
@@ -68,6 +68,10 @@
                         const ToolChain &HostTC,
                         const llvm::opt::ArgList &Args);
 
+  AMDGPUOpenMPToolChain(const Driver &D, const llvm::Triple &Triple,
+                        const ToolChain &HostTC, const llvm::opt::ArgList &Args,
+                        const std::string OffloadArch);
+
   const llvm::Triple *getAuxTriple() const override {
     return &HostTC.getTriple();
   }
diff --git a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
--- a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
@@ -166,7 +166,7 @@
   const toolchains::AMDGPUOpenMPToolChain &AMDGPUOpenMPTC =
       static_cast<const toolchains::AMDGPUOpenMPToolChain &>(TC);
 
-  std::string GPUArch = Args.getLastArgValue(options::OPT_march_EQ).str();
+  std::string GPUArch = AMDGPUOpenMPTC.getOffloadArch();
   if (GPUArch.empty()) {
     if (!checkSystemForAMDGPU(Args, AMDGPUOpenMPTC, GPUArch))
       return;
@@ -202,12 +202,21 @@
   getProgramPaths().push_back(getDriver().Dir);
 }
 
+/// Variant of the constructor that also records the offload arch (e.g. gfx908)
+/// this toolchain instance is created for.
+AMDGPUOpenMPToolChain::AMDGPUOpenMPToolChain(
+    const Driver &D, const llvm::Triple &Triple, const ToolChain &HostTC,
+    const ArgList &Args, std::string OffloadArch)
+    : ROCMToolChain(D, Triple, Args), HostTC(HostTC) {
+  getProgramPaths().push_back(getDriver().Dir);
+  setOffloadArch(std::move(OffloadArch));
+}
+
 void AMDGPUOpenMPToolChain::addClangTargetOptions(
     const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args,
     Action::OffloadKind DeviceOffloadingKind) const {
   HostTC.addClangTargetOptions(DriverArgs, CC1Args, DeviceOffloadingKind);
-
-  std::string GPUArch = DriverArgs.getLastArgValue(options::OPT_march_EQ).str();
+  std::string GPUArch = getOffloadArch();
   if (GPUArch.empty()) {
     if (!checkSystemForAMDGPU(DriverArgs, *this, GPUArch))
       return;
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -6659,20 +6659,32 @@
   }
 
   // For all the host OpenMP offloading compile jobs we need to pass the targets
-  // information using -fopenmp-targets= option.
+  // information using `-fopenmp-targets=` option.
   if (JA.isHostOffloading(Action::OFK_OpenMP)) {
     SmallString<128> TargetInfo("-fopenmp-targets=");
 
     Arg *Tgts = Args.getLastArg(options::OPT_fopenmp_targets_EQ);
-    assert(Tgts && Tgts->getNumValues() &&
-           "OpenMP offloading has to have targets specified.");
-    for (unsigned i = 0; i < Tgts->getNumValues(); ++i) {
-      if (i)
-        TargetInfo += ',';
-      // We need to get the string from the triple because it may be not exactly
-      // the same as the one we get directly from the arguments.
-      llvm::Triple T(Tgts->getValue(i));
-      TargetInfo += T.getTriple();
+    // Get list of device Toolchains
+    auto OpenMPTCRange = C.getOffloadToolChains<Action::OFK_OpenMP>();
+
+    if (Tgts && Tgts->getNumValues()) {
+      for (unsigned i = 0; i < Tgts->getNumValues(); ++i) {
+        if (i)
+          TargetInfo += ',';
+        // We need to get the string from the triple because it may be not
+        // exactly the same as the one we get directly from the arguments.
+        llvm::Triple T(Tgts->getValue(i));
+        TargetInfo += T.getTriple();
+      }
+    } else if (OpenMPTCRange.first != OpenMPTCRange.second) {
+      // Comma-separate the entries, as the -fopenmp-targets branch above does.
+      for (auto TI = OpenMPTCRange.first; TI != OpenMPTCRange.second; ++TI) {
+        if (TI != OpenMPTCRange.first)
+          TargetInfo += ',';
+        TargetInfo += TI->second->getTriple().str();
+      }
+    } else {
+      assert(false && "OpenMP offloading requires `-fopenmp-targets=`");
     }
     CmdArgs.push_back(Args.MakeArgString(TargetInfo.str()));
   }
@@ -7668,18 +7680,17 @@
       });
     }
     Triples += Action::GetOffloadKindName(CurKind);
-    Triples += "-";
-    std::string NormalizedTriple = CurTC->getTriple().normalize();
-    Triples += NormalizedTriple;
-
-    if (CurDep->getOffloadingArch() != nullptr) {
-      // If OffloadArch is present it can only appear as the 6th hypen
-      // sepearated field of Bundle Entry ID. So, pad required number of
-      // hyphens in Triple.
-      for (int i = 4 - StringRef(NormalizedTriple).count("-"); i > 0; i--)
-        Triples += "-";
+    Triples += '-';
+    Triples += CurTC->getTriple().normalize();
+    if ((CurKind == Action::OFK_HIP || CurKind == Action::OFK_Cuda) &&
+        CurDep->getOffloadingArch()) {
+      Triples += '-';
       Triples += CurDep->getOffloadingArch();
     }
+    if (CurKind == Action::OFK_OpenMP && !CurTC->getOffloadArch().empty()) {
+      Triples += '-';
+      Triples += CurTC->getOffloadArch();
+    }
   }
   CmdArgs.push_back(TCArgs.MakeArgString(Triples));
 
@@ -7711,7 +7722,7 @@
   C.addCommand(std::make_unique<Command>(
       JA, *this, ResponseFileSupport::None(),
       TCArgs.MakeArgString(getToolChain().GetProgramPath(getShortName())),
-      CmdArgs, None, Output));
+      CmdArgs, Inputs, Output));
 }
 
 void OffloadBundler::ConstructJobMultipleOutputs(
@@ -7746,20 +7757,21 @@
       Triples += ',';
 
     auto &Dep = DepInfo[I];
-    Triples += Action::GetOffloadKindName(Dep.DependentOffloadKind);
-    Triples += "-";
-    std::string NormalizedTriple =
-        Dep.DependentToolChain->getTriple().normalize();
-    Triples += NormalizedTriple;
-
-    if (!Dep.DependentBoundArch.empty()) {
-      // If OffloadArch is present it can only appear as the 6th hypen
-      // sepearated field of Bundle Entry ID. So, pad required number of
-      // hyphens in Triple.
-      for (int i = 4 - StringRef(NormalizedTriple).count("-"); i > 0; i--)
-        Triples += "-";
+    auto OffloadKind = Dep.DependentOffloadKind;
+    Triples += Action::GetOffloadKindName(OffloadKind);
+    Triples += '-';
+    Triples += Dep.DependentToolChain->getTriple().normalize();
+    if ((Dep.DependentOffloadKind == Action::OFK_HIP ||
+         Dep.DependentOffloadKind == Action::OFK_Cuda) &&
+        !Dep.DependentBoundArch.empty()) {
+      Triples += '-';
       Triples += Dep.DependentBoundArch;
     }
+    if (OffloadKind == Action::OFK_OpenMP &&
+        !Dep.DependentToolChain->getOffloadArch().empty()) {
+      Triples += '-';
+      Triples += Dep.DependentToolChain->getOffloadArch();
+    }
   }
 
   CmdArgs.push_back(TCArgs.MakeArgString(Triples));
@@ -7805,9 +7817,30 @@
   CmdArgs.push_back("-o");
   CmdArgs.push_back(Output.getFilename());
 
-  // Add inputs.
+  auto TCs = C.getOffloadToolChains<Action::OFK_OpenMP>();
+
+  // Add runtime requirements on each image which includes the offload-arch
+  auto II = TCs.first;
   for (const InputInfo &I : Inputs) {
     assert(I.isFilename() && "Invalid input.");
+    if (I.getAction()) {
+      auto TC = II->second;
+      II++;
+      std::string requirements("--requirements=");
+      requirements.append(TC->getOffloadArch());
+      // targetid could have user specified features such as :xnack-:sramecc+
+      // so replace ":" with "__" in requirements used for
+      // clang-offload-wrapper.
+      size_t start_pos = 0;
+      while ((start_pos = requirements.find(":", start_pos)) !=
+             std::string::npos) {
+        requirements.replace(start_pos, 1, "__");
+        start_pos += 2;
+      }
+
+      // FIXME: Add other architecture requirements here
+      CmdArgs.push_back(Args.MakeArgString(requirements.c_str()));
+    }
     CmdArgs.push_back(I.getFilename());
   }
 
diff --git a/clang/lib/Driver/ToolChains/Cuda.h b/clang/lib/Driver/ToolChains/Cuda.h
--- a/clang/lib/Driver/ToolChains/Cuda.h
+++ b/clang/lib/Driver/ToolChains/Cuda.h
@@ -134,6 +134,10 @@
                 const ToolChain &HostTC, const llvm::opt::ArgList &Args,
                 const Action::OffloadKind OK);
 
+  CudaToolChain(const Driver &D, const llvm::Triple &Triple,
+                const ToolChain &HostTC, const llvm::opt::ArgList &Args,
+                const Action::OffloadKind OK, const std::string OffloadArch);
+
   const llvm::Triple *getAuxTriple() const override {
     return &HostTC.getTriple();
   }
diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -404,6 +404,8 @@
   // flag or the default value.
   if (JA.isDeviceOffloading(Action::OFK_OpenMP)) {
     GPUArchName = Args.getLastArgValue(options::OPT_march_EQ);
+    if (GPUArchName.empty())
+      GPUArchName = TC.getOffloadArch();
     assert(!GPUArchName.empty() && "Must have an architecture passed in.");
   } else
     GPUArchName = JA.getOffloadingArch();
@@ -597,6 +599,9 @@
 
   StringRef GPUArch =
       Args.getLastArgValue(options::OPT_march_EQ);
+  if (GPUArch.empty())
+    GPUArch = getToolChain().getOffloadArch();
+
   assert(!GPUArch.empty() && "At least one GPU Arch required for ptxas.");
 
   CmdArgs.push_back("-arch");
@@ -659,6 +664,22 @@
   getProgramPaths().push_back(getDriver().Dir);
 }
 
+CudaToolChain::CudaToolChain(const Driver &D, const llvm::Triple &Triple,
+                             const ToolChain &HostTC, const ArgList &Args,
+                             const Action::OffloadKind OK,
+                             const std::string OffloadArch)
+    : ToolChain(D, Triple, Args), HostTC(HostTC),
+      CudaInstallation(D, HostTC.getTriple(), Args), OK(OK) {
+  if (CudaInstallation.isValid()) {
+    CudaInstallation.WarnIfUnsupportedVersion();
+    getProgramPaths().push_back(std::string(CudaInstallation.getBinPath()));
+  }
+  // Look up binaries in the driver directory as well; this is how the
+  // clang-offload-bundler executable is discovered.
+  getProgramPaths().push_back(getDriver().Dir);
+  setOffloadArch(OffloadArch); // record the arch passed to this constructor
+}
+
 std::string CudaToolChain::getInputFilename(const InputInfo &Input) const {
   // Only object files are changed, for example assembly files keep their .s
   // extensions. CUDA also continues to use .o as they don't use nvlink but
@@ -680,6 +701,8 @@
   HostTC.addClangTargetOptions(DriverArgs, CC1Args, DeviceOffloadingKind);
 
   StringRef GpuArch = DriverArgs.getLastArgValue(options::OPT_march_EQ);
+  if (GpuArch.empty())
+    GpuArch = getOffloadArch();
   assert(!GpuArch.empty() && "Must have an explicit GPU arch.");
   assert((DeviceOffloadingKind == Action::OFK_OpenMP ||
           DeviceOffloadingKind == Action::OFK_Cuda) &&
@@ -844,6 +867,8 @@
     }
 
     StringRef Arch = DAL->getLastArgValue(options::OPT_march_EQ);
+    if (Arch.empty())
+      Arch = getOffloadArch();
     if (Arch.empty())
       DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ),
                         CLANG_OPENMP_NVPTX_DEFAULT_ARCH);
diff --git a/clang/test/Driver/amdgpu-openmp-system-arch-fail.c b/clang/test/Driver/amdgpu-openmp-system-arch-fail.c
--- a/clang/test/Driver/amdgpu-openmp-system-arch-fail.c
+++ b/clang/test/Driver/amdgpu-openmp-system-arch-fail.c
@@ -15,14 +15,9 @@
 // case when amdgpu_arch returns nothing or fails
 // RUN:   %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -nogpulib --amdgpu-arch-tool=%t/amdgpu_arch_fail %s 2>&1 \
 // RUN:   | FileCheck %s --check-prefix=NO-OUTPUT-ERROR
-// NO-OUTPUT-ERROR: error: Cannot determine AMDGPU architecture{{.*}}Exited with error code 1. Consider passing it via --march
-
-// case when amdgpu_arch returns multiple gpus but all are different
-// RUN:   %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -nogpulib --amdgpu-arch-tool=%t/amdgpu_arch_different %s 2>&1 \
-// RUN:   | FileCheck %s --check-prefix=MULTIPLE-OUTPUT-ERROR
-// MULTIPLE-OUTPUT-ERROR: error: Cannot determine AMDGPU architecture: Multiple AMD GPUs found with different archs. Consider passing it via --march
+// NO-OUTPUT-ERROR: fatal error: The option -fopenmp-targets= requires additional options -Xopenmp-target= and -march=
 
 // case when amdgpu_arch does not return anything with successful execution
 // RUN:   %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -nogpulib --amdgpu-arch-tool=%t/amdgpu_arch_empty %s 2>&1 \
 // RUN:   | FileCheck %s --check-prefix=EMPTY-OUTPUT
-// EMPTY-OUTPUT: error: Cannot determine AMDGPU architecture: No AMD GPU detected in the system. Consider passing it via --march
+// EMPTY-OUTPUT: fatal error: The option -fopenmp-targets= requires additional options -Xopenmp-target= and -march=
diff --git a/clang/test/Driver/amdgpu-openmp-toolchain.c b/clang/test/Driver/amdgpu-openmp-toolchain.c
--- a/clang/test/Driver/amdgpu-openmp-toolchain.c
+++ b/clang/test/Driver/amdgpu-openmp-toolchain.c
@@ -10,9 +10,9 @@
 // CHECK: llvm-link{{.*}}"-o" "{{.*}}amdgpu-openmp-toolchain-{{.*}}-gfx906-linked-{{.*}}.bc"
 // CHECK: llc{{.*}}amdgpu-openmp-toolchain-{{.*}}-gfx906-linked-{{.*}}.bc" "-mtriple=amdgcn-amd-amdhsa" "-mcpu=gfx906" "-filetype=obj" "-o"{{.*}}amdgpu-openmp-toolchain-{{.*}}-gfx906-{{.*}}.o"
 // CHECK: lld{{.*}}"-flavor" "gnu" "--no-undefined" "-shared" "-o"{{.*}}amdgpu-openmp-toolchain-{{.*}}.out" "{{.*}}amdgpu-openmp-toolchain-{{.*}}-gfx906-{{.*}}.o"
-// CHECK: clang-offload-wrapper{{.*}}"-target" "x86_64-unknown-linux-gnu" "-o" "{{.*}}a-{{.*}}.bc" {{.*}}amdgpu-openmp-toolchain-{{.*}}.out"
-// CHECK: clang{{.*}}"-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}}"-o" "{{.*}}a-{{.*}}.o" "-x" "ir" "{{.*}}a-{{.*}}.bc"
-// CHECK: ld{{.*}}"-o" "a.out"{{.*}}"{{.*}}amdgpu-openmp-toolchain-{{.*}}.o" "{{.*}}a-{{.*}}.o" "-lomp" "-lomptarget"
+// CHECK: clang-offload-wrapper{{.*}}" "-target" "x86_64-unknown-linux-gnu" "-o" "{{.*}}a_{{.*}}.bc" "--requirements=gfx906" "{{.*}}amdgpu-openmp-toolchain-{{.*}}.out"
+// CHECK: clang{{.*}}"-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}}"-o" "{{.*}}a_{{.*}}.o" "-x" "ir" "{{.*}}a_{{.*}}.bc"
+// CHECK: ld{{.*}}"-o" "a.out"{{.*}}"{{.*}}amdgpu-openmp-toolchain-{{.*}}.o" "{{.*}}a_{{.*}}.o" "-lomp" "-lomptarget"
 
 // RUN:   %clang -ccc-print-phases --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-PHASES %s
@@ -26,14 +26,12 @@
 // CHECK-PHASES: 6: preprocessor, {5}, cpp-output, (device-openmp)
 // CHECK-PHASES: 7: compiler, {6}, ir, (device-openmp)
 // CHECK-PHASES: 8: offload, "host-openmp (x86_64-unknown-linux-gnu)" {2}, "device-openmp (amdgcn-amd-amdhsa)" {7}, ir
-// CHECK-PHASES: 9: backend, {8}, assembler, (device-openmp)
-// CHECK-PHASES: 10: assembler, {9}, object, (device-openmp)
-// CHECK-PHASES: 11: linker, {10}, image, (device-openmp)
-// CHECK-PHASES: 12: offload, "device-openmp (amdgcn-amd-amdhsa)" {11}, image
-// CHECK-PHASES: 13: clang-offload-wrapper, {12}, ir, (host-openmp)
-// CHECK-PHASES: 14: backend, {13}, assembler, (host-openmp)
-// CHECK-PHASES: 15: assembler, {14}, object, (host-openmp)
-// CHECK-PHASES: 16: linker, {4, 15}, image, (host-openmp)
+// CHECK-PHASES: 9: linker, {8}, image, (device-openmp)
+// CHECK-PHASES: 10: offload, "device-openmp (amdgcn-amd-amdhsa)" {9}, image
+// CHECK-PHASES: 11: clang-offload-wrapper, {10}, ir, (host-openmp)
+// CHECK-PHASES: 12: backend, {11}, assembler, (host-openmp)
+// CHECK-PHASES: 13: assembler, {12}, object, (host-openmp)
+// CHECK-PHASES: 14: linker, {4, 13}, image, (host-openmp)
 
 // handling of --libomptarget-amdgcn-bc-path
 // RUN:   %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib/libomptarget-amdgcn-gfx803.bc %s 2>&1 | FileCheck %s --check-prefix=CHECK-LIBOMPTARGET
@@ -73,4 +71,4 @@
 // CHECK-C: "x86_64-unknown-linux-gnu" - "offload bundler"
 
 // RUN:   %clang -### --target=x86_64-unknown-linux-gnu -emit-llvm -S -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-EMIT-LLVM-IR
-// CHECK-EMIT-LLVM-IR: clang{{.*}}"-cc1"{{.*}}"-triple" "amdgcn-amd-amdhsa"{{.*}}"-emit-llvm"
+// CHECK-EMIT-LLVM-IR: clang{{.*}}" "-cc1" "-triple" "amdgcn-amd-amdhsa"{{.*}}"-emit-llvm"
diff --git a/clang/test/Driver/hip-rdc-device-only.hip b/clang/test/Driver/hip-rdc-device-only.hip
--- a/clang/test/Driver/hip-rdc-device-only.hip
+++ b/clang/test/Driver/hip-rdc-device-only.hip
@@ -82,7 +82,7 @@
 // COMMON-SAME: {{.*}} {{".*a.cu"}}
 
 // COMMON: "{{.*}}clang-offload-bundler" "-type={{(bc|ll)}}"
-// COMMON-SAME: "-targets=hip-amdgcn-amd-amdhsa--gfx803,hip-amdgcn-amd-amdhsa--gfx900"
+// COMMON-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
 // COMMON-SAME: "-outputs=a-hip-amdgcn-amd-amdhsa.{{(bc|ll)}}"
 
 // COMMON: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
@@ -112,7 +112,7 @@
 // COMMON-SAME: {{.*}} {{".*b.hip"}}
 
 // COMMON: "{{.*}}clang-offload-bundler" "-type={{(bc|ll)}}"
-// COMMON-SAME: "-targets=hip-amdgcn-amd-amdhsa--gfx803,hip-amdgcn-amd-amdhsa--gfx900"
+// COMMON-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
 // COMMON-SAME: "-outputs=b-hip-amdgcn-amd-amdhsa.{{(bc|ll)}}"
 
 // SAVETEMP: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa" "-aux-triple" "x86_64-unknown-linux-gnu"
@@ -142,7 +142,7 @@
 // SAVETEMP-SAME: {{.*}} "-o" {{"a.*.ll"}} "-x" "ir" [[A_GFX900_TMP_BC]]
 
 // SAVETEMP: "{{.*}}clang-offload-bundler" "-type=ll"
-// SAVETEMP-SAME: "-targets=hip-amdgcn-amd-amdhsa--gfx803,hip-amdgcn-amd-amdhsa--gfx900"
+// SAVETEMP-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
 // SAVETEMP-SAME: "-outputs=a-hip-amdgcn-amd-amdhsa.ll"
 
 // SAVETEMP: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa" "-aux-triple" "x86_64-unknown-linux-gnu"
@@ -172,7 +172,7 @@
 // SAVETEMP-SAME: {{.*}} "-o" {{"b.*.ll"}} "-x" "ir" [[B_GFX900_TMP_BC]]
 
 // SAVETEMP: "{{.*}}clang-offload-bundler" "-type=ll"
-// SAVETEMP-SAME: "-targets=hip-amdgcn-amd-amdhsa--gfx803,hip-amdgcn-amd-amdhsa--gfx900"
+// SAVETEMP-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
 // SAVETEMP-SAME: "-outputs=b-hip-amdgcn-amd-amdhsa.ll"
 
 // FAIL: error: cannot specify -o when generating multiple output files
diff --git a/clang/test/Driver/hip-toolchain-rdc-separate.hip b/clang/test/Driver/hip-toolchain-rdc-separate.hip
--- a/clang/test/Driver/hip-toolchain-rdc-separate.hip
+++ b/clang/test/Driver/hip-toolchain-rdc-separate.hip
@@ -44,7 +44,7 @@
 // CHECK-SAME: {{.*}} [[A_SRC]]
 
 // CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
-// CHECK-SAME: "-targets=hip-amdgcn-amd-amdhsa--gfx803,hip-amdgcn-amd-amdhsa--gfx900,host-x86_64-unknown-linux-gnu"
+// CHECK-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900,host-x86_64-unknown-linux-gnu"
 // CHECK-SAME: "-outputs=[[A_O:.*a.o]]" "-inputs=[[A_BC1]],[[A_BC2]],[[A_OBJ_HOST]]"
 
 // CHECK: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
@@ -79,7 +79,7 @@
 // CHECK-SAME: {{.*}} [[B_SRC]]
 
 // CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
-// CHECK-SAME: "-targets=hip-amdgcn-amd-amdhsa--gfx803,hip-amdgcn-amd-amdhsa--gfx900,host-x86_64-unknown-linux-gnu"
+// CHECK-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900,host-x86_64-unknown-linux-gnu"
 // CHECK-SAME: "-outputs=[[B_O:.*b.o]]" "-inputs=[[B_BC1]],[[B_BC2]],[[B_OBJ_HOST]]"
 
 // RUN: touch %T/a.o
@@ -91,22 +91,22 @@
 // RUN: 2>&1 | FileCheck -check-prefix=LINK %s
 
 // LINK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
-// LINK-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa--gfx803,hip-amdgcn-amd-amdhsa--gfx900"
+// LINK-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
 // LINK-SAME: "-inputs=[[A_O:.*a.o]]" "-outputs=[[A_OBJ_HOST:.*o]],{{.*o}},{{.*o}}"
 // LINK: "-unbundle" "-allow-missing-bundles"
 
 // LINK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
-// LINK-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa--gfx803,hip-amdgcn-amd-amdhsa--gfx900"
+// LINK-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
 // LINK-SAME: "-inputs=[[B_O:.*b.o]]" "-outputs=[[B_OBJ_HOST:.*o]],{{.*o}},{{.*o}}"
 // LINK: "-unbundle" "-allow-missing-bundles"
 
 // LINK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
-// LINK-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa--gfx803,hip-amdgcn-amd-amdhsa--gfx900"
+// LINK-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
 // LINK-SAME: "-inputs=[[A_O]]" "-outputs={{.*o}},[[A_BC1:.*o]],[[A_BC2:.*o]]"
 // LINK: "-unbundle" "-allow-missing-bundles"
 
 // LINK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
-// LINK-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa--gfx803,hip-amdgcn-amd-amdhsa--gfx900"
+// LINK-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
 // LINK-SAME: "-inputs=[[B_O]]" "-outputs={{.*o}},[[B_BC1:.*o]],[[B_BC2:.*o]]"
 // LINK: "-unbundle" "-allow-missing-bundles"
 
diff --git a/clang/test/Driver/openmp-offload-multi.c b/clang/test/Driver/openmp-offload-multi.c
new file mode 100644
--- /dev/null
+++ b/clang/test/Driver/openmp-offload-multi.c
@@ -0,0 +1,34 @@
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: amdgpu-registered-target
+
+//
+// Legacy mode (-fopenmp-targets,-Xopenmp-target,-march) tests for
+// multi arch compilation
+//
+// RUN:   %clang -### -target x86_64-linux-gnu -fopenmp\
+// RUN:   -fopenmp-targets=amdgcn-amd-amdhsa,amdgcn-amd-amdhsa \
+// RUN:   -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 \
+// RUN:   -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx908 \
+// RUN:   %s 2>&1 | FileCheck %s
+
+// CHECK: clang{{.*}}"-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}}"-x" "c"{{.*}}
+// CHECK: clang{{.*}}"-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}}"-o" "[[HOSTOBJ:.*.o]]" "-x" "ir"{{.*}}
+
+// compilation for offload target 1 : gfx906
+// CHECK: clang{{.*}}"-cc1"{{.*}}"-triple" "amdgcn-amd-amdhsa"{{.*}}"-emit-llvm-bc"{{.*}}"-target-cpu" "gfx906" "-fcuda-is-device"{{.*}}"-o" "{{.*}}.bc" "-x" "c"{{.*}}.c
+// CHECK: llvm-link"{{.*}}openmp-offload-multi-{{.*}}.bc"{{.*}}"-o" "{{.*}}openmp-offload-multi-{{.*}}-gfx906-linked-{{.*}}.bc"
+// CHECK: llc{{.*}}openmp-offload-multi-{{.*}}-gfx906-linked-{{.*}}.bc" "-mtriple=amdgcn-amd-amdhsa" "-mcpu=gfx906" "-filetype=obj"{{.*}}"-o"{{.*}}openmp-offload-multi-{{.*}}-gfx906-{{.*}}.o"
+// CHECK: lld{{.*}}"-flavor" "gnu" "--no-undefined" "-shared" "-o" "[[GFX906OUT:.*.out]]" "{{.*}}openmp-offload-multi-{{.*}}-gfx906-{{.*}}.o"
+
+// compilation for offload target 2 : gfx908
+// CHECK: clang{{.*}}"-cc1"{{.*}}"-triple" "amdgcn-amd-amdhsa"{{.*}}"-emit-llvm-bc"{{.*}}"-target-cpu" "gfx908" "-fcuda-is-device"{{.*}}"-o" "{{.*}}.bc" "-x" "c"{{.*}}.c
+// CHECK: llvm-link"{{.*}}openmp-offload-multi-{{.*}}.bc"{{.*}}"-o" "{{.*}}openmp-offload-multi-{{.*}}-gfx908-linked-{{.*}}.bc"
+// CHECK: llc{{.*}}openmp-offload-multi-{{.*}}-gfx908-linked-{{.*}}.bc" "-mtriple=amdgcn-amd-amdhsa" "-mcpu=gfx908" "-filetype=obj"{{.*}}"-o"{{.*}}openmp-offload-multi-{{.*}}-gfx908-{{.*}}.o"
+// CHECK: lld{{.*}}"-flavor" "gnu" "--no-undefined" "-shared" "-o" "[[GFX908OUT:.*.out]]" "{{.*}}openmp-offload-multi-{{.*}}-gfx908-{{.*}}.o"
+
+// Combining device images for offload targets
+// CHECK: clang-offload-wrapper"{{.*}}" "-o" "[[COMBINEDIR:.*.bc]]" "--requirements=gfx906" "[[GFX906OUT]]" "--requirements=gfx908" "[[GFX908OUT]]"
+
+// CHECK: clang{{.*}}"-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}} "-fopenmp-targets=amdgcn-amd-amdhsa,amdgcn-amd-amdhsa"{{.*}}"-o" "[[COMBINEDOBJ:.*.o]]" "-x" "ir" "[[COMBINEDIR]]"
+// CHECK: ld.lld"{{.*}}" "-o" "a.out{{.*}}[[HOSTOBJ]]" "[[COMBINEDOBJ]]{{.*}}" "-lomp{{.*}}-lomptarget"
diff --git a/clang/tools/clang-offload-wrapper/ClangOffloadWrapper.cpp b/clang/tools/clang-offload-wrapper/ClangOffloadWrapper.cpp
--- a/clang/tools/clang-offload-wrapper/ClangOffloadWrapper.cpp
+++ b/clang/tools/clang-offload-wrapper/ClangOffloadWrapper.cpp
@@ -60,6 +60,11 @@
            cl::desc("Target triple for the output module"),
            cl::value_desc("triple"), cl::cat(ClangOffloadWrapperCategory));
 
+static cl::list<std::string>
+    OffloadArchs("requirements", cl::desc("requirements contains offload-arch"),
+                 cl::value_desc("requirements"),
+                 cl::cat(ClangOffloadWrapperCategory));
+
 namespace {
 
 class BinaryWrapper {
@@ -69,6 +74,7 @@
   StructType *EntryTy = nullptr;
   StructType *ImageTy = nullptr;
   StructType *DescTy = nullptr;
+  StructType *ImageInfoTy = nullptr;
 
 private:
   IntegerType *getSizeTTy() {
@@ -134,6 +140,27 @@
     return PointerType::getUnqual(getBinDescTy());
   }
 
+  // This matches the runtime struct definition of __tgt_image_info
+  // declared in openmp/libomptarget/include/omptarget.h:
+  // struct __tgt_image_info {
+  //   int32_t version;
+  //   int32_t image_number;
+  //   int32_t number_images;
+  //   char* requirements;
+  //   char* compile_opts;
+  // };
+  StructType *getImageInfoTy() {
+    if (!ImageInfoTy)
+      ImageInfoTy = StructType::create(
+          "__tgt_image_info", Type::getInt32Ty(C), Type::getInt32Ty(C),
+          Type::getInt32Ty(C), Type::getInt8PtrTy(C), Type::getInt8PtrTy(C));
+    return ImageInfoTy;
+  }
+
+  PointerType *getImageInfoPtrTy() {
+    return PointerType::getUnqual(getImageInfoTy());
+  }
+
   /// Creates binary descriptor for the given device images. Binary descriptor
   /// is an object that is passed to the offloading runtime at program startup
   /// and it describes all device images available in the executable or shared
@@ -245,7 +272,9 @@
                               ".omp_offloading.descriptor");
   }
 
-  void createRegisterFunction(GlobalVariable *BinDesc) {
+  void createRegisterFunction(GlobalVariable *BinDesc,
+                              ArrayRef<ArrayRef<char>> Requirements) {
+
     auto *FuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false);
     auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage,
                                   ".omp_offloading.descriptor_reg", &M);
@@ -259,6 +288,47 @@
 
     // Construct function body
     IRBuilder<> Builder(BasicBlock::Create(C, "entry", Func));
+
+    // Create calls to __tgt_register_image_info for each image
+    auto *NullPtr = llvm::ConstantPointerNull::get(Builder.getInt8PtrTy());
+    auto *Zero = ConstantInt::get(getSizeTTy(), 0u);
+    auto *RegInfoFuncTy =
+        FunctionType::get(Type::getVoidTy(C), getImageInfoPtrTy(), false);
+    FunctionCallee RegInfoFuncC =
+        M.getOrInsertFunction("__tgt_register_image_info", RegInfoFuncTy);
+    unsigned int img_count = 0;
+    for (ArrayRef<char> Requirement : Requirements) {
+      Constant *RequirementV = ConstantDataArray::get(C, Requirement);
+      auto *GV =
+          new GlobalVariable(M, RequirementV->getType(), /*isConstant*/ true,
+                             GlobalValue::InternalLinkage, RequirementV,
+                             Twine("__offload_arch_" + Twine(img_count)));
+      GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+
+      // store value of these variables (i.e. offload archs) into a custom
+      // section which will be used by "offload-arch -f". It won't be
+      // removed during binary stripping.
+      GV->setSection(".offload_arch_list");
+
+      auto *RequirementVPtr =
+          ConstantExpr::getGetElementPtr(GV->getValueType(), GV, Zero);
+      RequirementVPtr =
+          ConstantExpr::getBitCast(RequirementVPtr, Type::getInt8PtrTy(C));
+      auto *InfoInit = ConstantStruct::get(
+          getImageInfoTy(), ConstantInt::get(Type::getInt32Ty(C), 1),
+          ConstantInt::get(Type::getInt32Ty(C), img_count),
+          ConstantInt::get(Type::getInt32Ty(C), (uint32_t)Requirements.size()),
+          RequirementVPtr,
+          NullPtr // TODO: capture target-compile-opts from clang driver
+      );
+      auto *ImageInfoGV = new GlobalVariable(
+          M, InfoInit->getType(),
+          /*isConstant*/ true, GlobalValue::InternalLinkage, InfoInit,
+          Twine(".offload_image_info_" + Twine(img_count++)));
+      ImageInfoGV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+      Builder.CreateCall(RegInfoFuncC, ImageInfoGV);
+    }
+
     Builder.CreateCall(RegFuncC, BinDesc);
     Builder.CreateRetVoid();
 
@@ -298,10 +368,11 @@
     M.setTargetTriple(Target);
   }
 
-  const Module &wrapBinaries(ArrayRef<ArrayRef<char>> Binaries) {
+  const Module &wrapBinaries(ArrayRef<ArrayRef<char>> Binaries,
+                             ArrayRef<ArrayRef<char>> Requirements) {
     GlobalVariable *Desc = createBinDesc(Binaries);
     assert(Desc && "no binary descriptor");
-    createRegisterFunction(Desc);
+    createRegisterFunction(Desc, Requirements);
     createUnregisterFunction(Desc);
     return M;
   }
@@ -363,10 +434,20 @@
     return 1;
   }
 
+  SmallVector<ArrayRef<char>, 4u> Requirements;
+  Requirements.reserve(OffloadArchs.size());
+  for (unsigned i = 0; i != OffloadArchs.size(); ++i) {
+    OffloadArchs[i].append("\0");
+    Requirements.emplace_back(OffloadArchs[i].data(),
+                              OffloadArchs[i].size() + 1);
+  }
+
   // Create a wrapper for device binaries and write its bitcode to the file.
-  WriteBitcodeToFile(BinaryWrapper(Target).wrapBinaries(
-                         makeArrayRef(Images.data(), Images.size())),
-                     Out.os());
+  WriteBitcodeToFile(
+      BinaryWrapper(Target).wrapBinaries(
+          makeArrayRef(Images.data(), Images.size()),
+          makeArrayRef(Requirements.data(), Requirements.size())),
+      Out.os());
   if (Out.os().has_error()) {
     reportError(createFileError(Output, Out.os().error()));
     return 1;
diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h
--- a/openmp/libomptarget/include/omptarget.h
+++ b/openmp/libomptarget/include/omptarget.h
@@ -120,6 +120,44 @@
   __tgt_offload_entry *HostEntriesEnd;   // End of table (non inclusive)
 };
 
+/// __tgt_image_info:
+///
+/// The information in this struct is provided in clang-offload-wrapper
+/// as a call to __tgt_register_image_info for each image in the library
+/// of images also created by clang-offload-wrapper.
+/// __tgt_register_image_info is called for each image BEFORE the single
+/// call to __tgt_register_lib so that image information is available
+/// before they are loaded.  clang-offload-wrapper gets this image information
+/// from command line arguments provided by the clang driver when it creates
+/// the call to the clang-offload-wrapper command.
+/// This architecture allows the binary image (pointed to by ImageStart and
+/// ImageEnd in __tgt_device_image) to remain architecture independent.
+/// That is, the architecture independent part of the libomptarget runtime
+/// does not need to peer inside the image to determine if it is loadable
+/// even though in most cases the image is an elf object.
+/// There is one __tgt_image_info for each __tgt_device_image. For backward
+/// compatibility, no changes are allowed to either __tgt_device_image or
+/// __tgt_bin_desc. The absence of __tgt_image_info is the indication that
+/// the runtime is being used on a binary created by an old version of
+/// the compiler.
+///
+struct __tgt_image_info {
+  int32_t version;           // The version of this struct
+  int32_t image_number;      // Image number in image library starting from 0
+  int32_t number_images;     // Number of images, used for initial allocation
+  char *requirements;        // e.g. sm_30, sm_70, gfx906, includes features
+  char *compile_opts;        // reserved for future use
+};
+
+/// __tgt_active_offload_env
+///
+/// This structure is filled in by __tgt_get_active_offload_env and is used
+/// to determine compatibility of the images with the current environment
+/// that is "in play".
+struct __tgt_active_offload_env {
+  char *capabilities; // capability string reported by the arch query tool
+};
+
 /// This struct contains the offload entries identified by the target runtime
 struct __tgt_target_table {
   __tgt_offload_entry *EntriesBegin; // Begin of the table with all the entries
@@ -210,6 +248,13 @@
 /// adds a target shared library to the target execution image
 void __tgt_register_lib(__tgt_bin_desc *desc);
 
+/// adds an image information struct, called for each image
+void __tgt_register_image_info(__tgt_image_info *imageInfo);
+
+/// gets pointer to image information for specified image number
+/// Returns nullptr for apps built with old version of compiler
+__tgt_image_info *__tgt_get_image_info(uint32_t image_num);
+
 /// removes a target shared library from the target execution image
 void __tgt_unregister_lib(__tgt_bin_desc *desc);
 
diff --git a/openmp/libomptarget/src/exports b/openmp/libomptarget/src/exports
--- a/openmp/libomptarget/src/exports
+++ b/openmp/libomptarget/src/exports
@@ -2,6 +2,7 @@
   global:
     __tgt_register_requires;
     __tgt_register_lib;
+    __tgt_register_image_info;
     __tgt_unregister_lib;
     __tgt_target_data_begin;
     __tgt_target_data_end;
diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -43,6 +43,30 @@
   PM->RTLs.RegisterLib(desc);
 }
 
+static __tgt_image_info **__tgt_AllImageInfos;
+static int __tgt_num_registered_images = 0;
+/// Store \p imageInfo in a zero-filled table; bad image numbers are ignored.
+EXTERN void __tgt_register_image_info(__tgt_image_info *imageInfo) {
+  DP(" register_image_info image %d of %d  requirements:%s VERSION:%d\n",
+     imageInfo->image_number, imageInfo->number_images, imageInfo->requirements,
+     imageInfo->version);
+  if (!__tgt_num_registered_images && imageInfo->number_images > 0) {
+    __tgt_AllImageInfos = (__tgt_image_info **)calloc(
+        imageInfo->number_images, sizeof(__tgt_image_info *));
+    if (__tgt_AllImageInfos) // the count stays 0 if the allocation failed
+      __tgt_num_registered_images = imageInfo->number_images;
+  }
+  if (imageInfo->image_number >= 0 &&
+      imageInfo->image_number < __tgt_num_registered_images)
+    __tgt_AllImageInfos[imageInfo->image_number] = imageInfo;
+}
+
+/// Return the image information registered for \p image_number, or nullptr.
+EXTERN __tgt_image_info *__tgt_get_image_info(unsigned image_number) {
+  return image_number < (unsigned)__tgt_num_registered_images
+             ? __tgt_AllImageInfos[image_number] : nullptr;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 /// unloads a target shared library
 EXTERN void __tgt_unregister_lib(__tgt_bin_desc *desc) {
@@ -55,6 +79,10 @@
       }
     }
   }
+  if (__tgt_num_registered_images) {
+    free(__tgt_AllImageInfos);
+    __tgt_num_registered_images = 0;
+  }
 }
 
 /// creates host-to-target data mapping, stores it in the
diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp
--- a/openmp/libomptarget/src/rtl.cpp
+++ b/openmp/libomptarget/src/rtl.cpp
@@ -20,6 +20,7 @@
 #include <dlfcn.h>
 #include <mutex>
 #include <string>
+#include <sys/stat.h>
 
 // List of all plugins that can support offloading.
 static const char *RTLNames[] = {
@@ -288,18 +289,131 @@
      flags, RequiresFlags);
 }
 
+/// Query the runtime capabilities of this system by running the amdgpu-arch
+/// tool from ../bin relative to libomptarget.so. \p active_env is pointed at
+/// \p offload_arch_output_buffer: the tool's last output line, or "" if none.
+static void
+__tgt_get_active_offload_env(__tgt_active_offload_env *active_env,
+                             char *offload_arch_output_buffer,
+                             size_t offload_arch_output_buffer_size) {
+  active_env->capabilities = offload_arch_output_buffer;
+  offload_arch_output_buffer[0] = '\0'; // stays "" on every failure path
+  void *handle = dlopen("libomptarget.so", RTLD_NOW);
+  if (!handle) {
+    DP("dlopen() failed: %s\n", dlerror());
+    return;
+  }
+  char dir_name[PATH_MAX] = {'\0'};
+  if (dlinfo(handle, RTLD_DI_ORIGIN, dir_name) == -1)
+    DP("RTLD_DI_ORIGIN failed: %s\n", dlerror());
+  dlclose(handle); // the handle was only needed to look up the directory
+  std::string cmd_bin = std::string(dir_name) + "/../bin/amdgpu-arch";
+  struct stat st;
+  const bool found = dir_name[0] != '\0' && !stat(cmd_bin.c_str(), &st);
+  FILE *stream = found ? popen(cmd_bin.c_str(), "r") : nullptr;
+  if (!stream) {
+    DP("Missing offload-arch command at %s \n", cmd_bin.c_str());
+    return;
+  }
+  // Every fgets overwrites the buffer, so only the last line read is kept.
+  while (fgets(offload_arch_output_buffer, offload_arch_output_buffer_size,
+               stream) != NULL)
+    ;
+  pclose(stream);
+  size_t slen = strlen(offload_arch_output_buffer);
+  if (slen > 0 && offload_arch_output_buffer[slen - 1] == '\n')
+    offload_arch_output_buffer[slen - 1] = '\0'; // drop the line feed
+}
+
+/// Splits \p input on \p sep; drops empty pieces and a 1-char final piece.
+std::vector<std::string> _splitstrings(char *input, const char *sep) {
+  const std::string text(input);
+  const std::string delim(sep);
+  std::vector<std::string> pieces;
+  size_t begin = 0;
+  for (size_t hit; (hit = text.find(delim, begin)) != std::string::npos;
+       begin = hit + delim.length())
+    if (hit != begin)
+      pieces.push_back(text.substr(begin, hit - begin));
+  if (text.length() - begin > 1)
+    pieces.push_back(text.substr(begin));
+  return pieces;
+}
+
+/// Returns true when every requirement of the image is met by the environment.
+static bool _ImageIsCompatibleWithEnv(__tgt_image_info *img_info,
+                                      __tgt_active_offload_env *active_env) {
+  // A null img_info means no image information was registered. Assume the
+  // application was built with an old compiler and report the image as
+  // compatible so that it still gets checked.
+  if (img_info == nullptr)
+    return true;
+
+  // img_info->requirements lists the image's runtime requirements separated
+  // by "__"; active_env->capabilities lists the capabilities of the running
+  // system separated by spaces. The image is compatible only when every
+  // requirement also appears as a capability.
+  const std::vector<std::string> required =
+      _splitstrings(img_info->requirements, "__");
+  const std::vector<std::string> available =
+      _splitstrings(active_env->capabilities, " ");
+
+  bool all_found = true;
+  for (const std::string &need : required) {
+    bool found = false;
+    for (const std::string &have : available)
+      found = found || (have == need);
+    if (found)
+      continue;
+    // Keep scanning so that every missing requirement gets reported.
+    DP("Image requires %s but runtime capability %s is missing.\n",
+       img_info->requirements, need.c_str());
+    all_found = false;
+  }
+  return all_found;
+}
+
+#define MAX_CAPS_STR_SIZE 1024
 void RTLsTy::RegisterLib(__tgt_bin_desc *desc) {
+
+  // Get the current active offload environment
+  __tgt_active_offload_env offload_env;
+  // Need a buffer to hold results of offload-arch -c command
+  size_t offload_arch_output_buffer_size = MAX_CAPS_STR_SIZE;
+  char *offload_arch_output_buffer =
+      (char *)malloc(offload_arch_output_buffer_size);
+  __tgt_get_active_offload_env(&offload_env, offload_arch_output_buffer,
+                               offload_arch_output_buffer_size);
+
+  bool requires_usm = (bool)(RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY);
+  bool has_xnack = (std::string(offload_env.capabilities).find("xnack+") !=
+                    std::string::npos);
+  bool is_amd = (std::string(offload_env.capabilities).find("gfx") == 0);
+  if (is_amd && requires_usm && !has_xnack) {
+    fprintf(stderr, "WARNING: USM SET WITHOUT XNACK ENABLED.\n");
+    fprintf(stderr, "         THIS WILL BECOME FATAL ERROR IN FUTURE.\n");
+  }
+#if 0
+    FATAL_MESSAGE0(1, "'#pragma omp requires unified_shared_memory' requires "
+                      "environment with xnack+ capability!");
+#endif
+
+  RTLInfoTy *FoundRTL = NULL;
   PM->RTLsMtx.lock();
   // Register the images with the RTLs that understand them, if any.
   for (int32_t i = 0; i < desc->NumDeviceImages; ++i) {
     // Obtain the image.
     __tgt_device_image *img = &desc->DeviceImages[i];
 
-    RTLInfoTy *FoundRTL = NULL;
-
+    // Get corresponding image info requirements and check with runtime
+    __tgt_image_info *img_info = __tgt_get_image_info(i);
+    if (!_ImageIsCompatibleWithEnv(img_info, &offload_env))
+      continue;
+    FoundRTL = NULL;
     // Scan the RTLs that have associated images until we find one that supports
     // the current image.
     for (auto &R : AllRTLs) {
+
       if (!R.is_valid_binary(img)) {
         DP("Image " DPxMOD " is NOT compatible with RTL %s!\n",
            DPxPTR(img->ImageStart), R.RTLName.c_str());
@@ -368,7 +482,41 @@
   }
   PM->RTLsMtx.unlock();
 
+  if (!FoundRTL) {
+    if (PM->TargetOffloadPolicy == tgt_mandatory)
+      fprintf(stderr, "ERROR:\
+	Runtime capabilities do NOT meet any offload image requirements\n\
+	and the OMP_TARGET_OFFLOAD policy is mandatory.  Terminating!\n\
+	Runtime capabilities : %s\n",
+              offload_env.capabilities);
+    else if (PM->TargetOffloadPolicy == tgt_disabled)
+      fprintf(stderr, "WARNING: Offloading is disabled.\n");
+    else
+      fprintf(
+          stderr,
+          "WARNING: Runtime capabilities do NOT meet any image requirements.\n\
+	 So device offloading is now disabled.\n\
+	Runtime capabilities : %s\n",
+          offload_env.capabilities);
+    if (PM->TargetOffloadPolicy != tgt_disabled) {
+      for (int32_t i = 0; i < desc->NumDeviceImages; ++i) {
+        __tgt_image_info *img_info = __tgt_get_image_info(i);
+        if (img_info)
+          fprintf(stderr, "\
+	  Image %d requirements : %s\n",
+                  i, img_info->requirements);
+        else
+          fprintf(stderr, "\
+	  Image %d has no requirements. Could be from older compiler\n",
+                  i);
+      }
+    }
+    if (PM->TargetOffloadPolicy == tgt_mandatory)
+      exit(1);
+  }
+
   DP("Done registering entries!\n");
+  free(offload_arch_output_buffer);
 }
 
 void RTLsTy::UnregisterLib(__tgt_bin_desc *desc) {