diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h
--- a/clang/lib/CodeGen/CGOpenMPRuntime.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.h
@@ -2476,6 +2476,20 @@
   }
 };
 
+/// To encapsulate helper methods to be used by target specific specializations
+/// of CGOpenMPRuntimeGPU.
+class CodeGenUtil {
+public:
+  static FieldDecl *addFieldToRecordDecl(ASTContext &C, DeclContext *DC,
+                                         QualType FieldTy);
+
+  template <class... As>
+  static llvm::GlobalVariable *createGlobalStruct(CodeGenModule &CGM,
+                                                  QualType Ty, bool IsConstant,
+                                                  ArrayRef<llvm::Constant *> Data,
+                                                  const Twine &Name,
+                                                  As &&... Args);
+};
+
 } // namespace CodeGen
 } // namespace clang
 
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -1048,17 +1048,6 @@
                          AlignmentSource::Decl);
 }
 
-static FieldDecl *addFieldToRecordDecl(ASTContext &C, DeclContext *DC,
-                                       QualType FieldTy) {
-  auto *Field = FieldDecl::Create(
-      C, DC, SourceLocation(), SourceLocation(), /*Id=*/nullptr, FieldTy,
-      C.getTrivialTypeSourceInfo(FieldTy, SourceLocation()),
-      /*BW=*/nullptr, /*Mutable=*/false, /*InitStyle=*/ICIS_NoInit);
-  Field->setAccess(AS_public);
-  DC->addDecl(Field);
-  return Field;
-}
-
 CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM, StringRef FirstSeparator,
                                  StringRef Separator)
     : CGM(CGM), FirstSeparator(FirstSeparator), Separator(Separator),
@@ -1352,11 +1341,21 @@
   }
 }
 
+FieldDecl *clang::CodeGen::CodeGenUtil::addFieldToRecordDecl(
+    ASTContext &C, DeclContext *DC, QualType FieldTy) {
+  auto *Field = FieldDecl::Create(
+      C, DC, SourceLocation(), SourceLocation(), /*Id=*/nullptr, FieldTy,
+      C.getTrivialTypeSourceInfo(FieldTy, SourceLocation()),
+      /*BW=*/nullptr, /*Mutable=*/false, /*InitStyle=*/ICIS_NoInit);
+  Field->setAccess(AS_public);
+  DC->addDecl(Field);
+  return Field;
+}
+
 template <class... As>
-static llvm::GlobalVariable *
-createGlobalStruct(CodeGenModule &CGM, QualType Ty, bool IsConstant,
-                   ArrayRef<llvm::Constant *> Data, const Twine &Name,
-                   As &&... Args) {
+llvm::GlobalVariable *clang::CodeGen::CodeGenUtil::createGlobalStruct(
+    CodeGenModule &CGM, QualType Ty, bool IsConstant,
+    ArrayRef<llvm::Constant *> Data, const Twine &Name, As &&... Args) {
   const auto *RD = cast<RecordDecl>(Ty->getAsTagDecl());
   const CGRecordLayout &RL = CGM.getTypes().getCGRecordLayout(RD);
   ConstantInitBuilder CIBuilder(CGM);
@@ -3072,7 +3071,7 @@
       llvm::ConstantInt::get(CGM.Int32Ty, Flags),
       llvm::ConstantInt::get(CGM.Int32Ty, 0)};
   std::string EntryName = getName({"omp_offloading", "entry", ""});
-  llvm::GlobalVariable *Entry = createGlobalStruct(
+  llvm::GlobalVariable *Entry = CodeGenUtil::createGlobalStruct(
       CGM, getTgtOffloadEntryQTy(), /*IsConstant=*/true, Data,
       Twine(EntryName).concat(Name), llvm::GlobalValue::WeakAnyLinkage);
 
@@ -3350,12 +3349,12 @@
     ASTContext &C = CGM.getContext();
     RecordDecl *RD = C.buildImplicitRecord("__tgt_offload_entry");
     RD->startDefinition();
-    addFieldToRecordDecl(C, RD, C.VoidPtrTy);
-    addFieldToRecordDecl(C, RD, C.getPointerType(C.CharTy));
-    addFieldToRecordDecl(C, RD, C.getSizeType());
-    addFieldToRecordDecl(
+    CodeGenUtil::addFieldToRecordDecl(C, RD, C.VoidPtrTy);
+    CodeGenUtil::addFieldToRecordDecl(C, RD, C.getPointerType(C.CharTy));
+    CodeGenUtil::addFieldToRecordDecl(C, RD, C.getSizeType());
+    CodeGenUtil::addFieldToRecordDecl(
         C, RD, C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/true));
-    addFieldToRecordDecl(
+    CodeGenUtil::addFieldToRecordDecl(
         C, RD, C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/true));
     RD->completeDefinition();
     RD->addAttr(PackedAttr::CreateImplicit(C));
@@ -3399,7 +3398,7 @@
       if (Pair.second.isLocalPrivate() &&
          VD->getType()->isLValueReferenceType())
        Type = C.getPointerType(Type);
-      FieldDecl *FD = addFieldToRecordDecl(C, RD, Type);
+      FieldDecl *FD = CodeGenUtil::addFieldToRecordDecl(C, RD, Type);
       if (VD->hasAttrs()) {
        for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
             E(VD->getAttrs().end());
@@ -3433,27 +3432,27 @@
   //       };
   RecordDecl *UD = C.buildImplicitRecord("kmp_cmplrdata_t", TTK_Union);
   UD->startDefinition();
-  addFieldToRecordDecl(C, UD, KmpInt32Ty);
-  addFieldToRecordDecl(C, UD, KmpRoutineEntryPointerQTy);
+  CodeGenUtil::addFieldToRecordDecl(C, UD, KmpInt32Ty);
+  CodeGenUtil::addFieldToRecordDecl(C, UD, KmpRoutineEntryPointerQTy);
   UD->completeDefinition();
   QualType KmpCmplrdataTy = C.getRecordType(UD);
   RecordDecl *RD = C.buildImplicitRecord("kmp_task_t");
   RD->startDefinition();
-  addFieldToRecordDecl(C, RD, C.VoidPtrTy);
-  addFieldToRecordDecl(C, RD, KmpRoutineEntryPointerQTy);
-  addFieldToRecordDecl(C, RD, KmpInt32Ty);
-  addFieldToRecordDecl(C, RD, KmpCmplrdataTy);
-  addFieldToRecordDecl(C, RD, KmpCmplrdataTy);
+  CodeGenUtil::addFieldToRecordDecl(C, RD, C.VoidPtrTy);
+  CodeGenUtil::addFieldToRecordDecl(C, RD, KmpRoutineEntryPointerQTy);
+  CodeGenUtil::addFieldToRecordDecl(C, RD, KmpInt32Ty);
+  CodeGenUtil::addFieldToRecordDecl(C, RD, KmpCmplrdataTy);
+  CodeGenUtil::addFieldToRecordDecl(C, RD, KmpCmplrdataTy);
   if (isOpenMPTaskLoopDirective(Kind)) {
     QualType KmpUInt64Ty =
         CGM.getContext().getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/0);
     QualType KmpInt64Ty =
         CGM.getContext().getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/1);
-    addFieldToRecordDecl(C, RD, KmpUInt64Ty);
-    addFieldToRecordDecl(C, RD, KmpUInt64Ty);
-    addFieldToRecordDecl(C, RD, KmpInt64Ty);
-    addFieldToRecordDecl(C, RD, KmpInt32Ty);
-    addFieldToRecordDecl(C, RD, C.VoidPtrTy);
+    CodeGenUtil::addFieldToRecordDecl(C, RD, KmpUInt64Ty);
+    CodeGenUtil::addFieldToRecordDecl(C, RD, KmpUInt64Ty);
+    CodeGenUtil::addFieldToRecordDecl(C, RD, KmpInt64Ty);
+    CodeGenUtil::addFieldToRecordDecl(C, RD, KmpInt32Ty);
+    CodeGenUtil::addFieldToRecordDecl(C, RD, C.VoidPtrTy);
   }
   RD->completeDefinition();
   return RD;
@@ -3469,9 +3468,9 @@
   //       };
   RecordDecl *RD = C.buildImplicitRecord("kmp_task_t_with_privates");
   RD->startDefinition();
-  addFieldToRecordDecl(C, RD, KmpTaskTQTy);
+  CodeGenUtil::addFieldToRecordDecl(C, RD, KmpTaskTQTy);
   if (const RecordDecl *PrivateRD = createPrivatesRecordDecl(CGM, Privates))
-    addFieldToRecordDecl(C, RD, C.getRecordType(PrivateRD));
+    CodeGenUtil::addFieldToRecordDecl(C, RD, C.getRecordType(PrivateRD));
   RD->completeDefinition();
   return RD;
 }
@@ -4100,9 +4099,9 @@
    RecordDecl *KmpAffinityInfoRD =
        C.buildImplicitRecord("kmp_task_affinity_info_t");
    KmpAffinityInfoRD->startDefinition();
-    addFieldToRecordDecl(C, KmpAffinityInfoRD, C.getIntPtrType());
-    addFieldToRecordDecl(C, KmpAffinityInfoRD, C.getSizeType());
-    addFieldToRecordDecl(C, KmpAffinityInfoRD, FlagsTy);
+    CodeGenUtil::addFieldToRecordDecl(C, KmpAffinityInfoRD, C.getIntPtrType());
+    CodeGenUtil::addFieldToRecordDecl(C, KmpAffinityInfoRD, C.getSizeType());
+    CodeGenUtil::addFieldToRecordDecl(C, KmpAffinityInfoRD, FlagsTy);
    KmpAffinityInfoRD->completeDefinition();
    KmpTaskAffinityInfoTy = C.getRecordType(KmpAffinityInfoRD);
  }
@@ -4536,9 +4535,9 @@
   if (KmpDependInfoTy.isNull()) {
    RecordDecl *KmpDependInfoRD = C.buildImplicitRecord("kmp_depend_info");
    KmpDependInfoRD->startDefinition();
-    addFieldToRecordDecl(C, KmpDependInfoRD, C.getIntPtrType());
-    addFieldToRecordDecl(C, KmpDependInfoRD, C.getSizeType());
-    addFieldToRecordDecl(C, KmpDependInfoRD, FlagsTy);
+    CodeGenUtil::addFieldToRecordDecl(C, KmpDependInfoRD, C.getIntPtrType());
+    CodeGenUtil::addFieldToRecordDecl(C, KmpDependInfoRD, C.getSizeType());
+    CodeGenUtil::addFieldToRecordDecl(C, KmpDependInfoRD, FlagsTy);
    KmpDependInfoRD->completeDefinition();
    KmpDependInfoTy = C.getRecordType(KmpDependInfoRD);
  }
@@ -5985,13 +5984,13 @@
   ASTContext &C = CGM.getContext();
   RecordDecl *RD = C.buildImplicitRecord("kmp_taskred_input_t");
   RD->startDefinition();
-  const FieldDecl *SharedFD = addFieldToRecordDecl(C, RD, C.VoidPtrTy);
-  const FieldDecl *OrigFD = addFieldToRecordDecl(C, RD, C.VoidPtrTy);
-  const FieldDecl *SizeFD = addFieldToRecordDecl(C, RD, C.getSizeType());
-  const FieldDecl *InitFD = addFieldToRecordDecl(C, RD, C.VoidPtrTy);
-  const FieldDecl *FiniFD = addFieldToRecordDecl(C, RD, C.VoidPtrTy);
-  const FieldDecl *CombFD = addFieldToRecordDecl(C, RD, C.VoidPtrTy);
-  const FieldDecl *FlagsFD = addFieldToRecordDecl(
+  const FieldDecl *SharedFD = CodeGenUtil::addFieldToRecordDecl(C, RD, C.VoidPtrTy);
+  const FieldDecl *OrigFD = CodeGenUtil::addFieldToRecordDecl(C, RD, C.VoidPtrTy);
+  const FieldDecl *SizeFD = CodeGenUtil::addFieldToRecordDecl(C, RD, C.getSizeType());
+  const FieldDecl *InitFD = CodeGenUtil::addFieldToRecordDecl(C, RD, C.VoidPtrTy);
+  const FieldDecl *FiniFD = CodeGenUtil::addFieldToRecordDecl(C, RD, C.VoidPtrTy);
+  const FieldDecl *CombFD = CodeGenUtil::addFieldToRecordDecl(C, RD, C.VoidPtrTy);
+  const FieldDecl *FlagsFD = CodeGenUtil::addFieldToRecordDecl(
       C, RD, C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false));
   RD->completeDefinition();
   QualType RDType = C.getRecordType(RD);
@@ -11104,9 +11103,9 @@
    //  };
    RD = C.buildImplicitRecord("kmp_dim");
    RD->startDefinition();
-    addFieldToRecordDecl(C, RD, Int64Ty);
-    addFieldToRecordDecl(C, RD, Int64Ty);
-    addFieldToRecordDecl(C, RD, Int64Ty);
+    CodeGenUtil::addFieldToRecordDecl(C, RD, Int64Ty);
+    CodeGenUtil::addFieldToRecordDecl(C, RD, Int64Ty);
+    CodeGenUtil::addFieldToRecordDecl(C, RD, Int64Ty);
    RD->completeDefinition();
    KmpDimTy = C.getRecordType(RD);
  } else {
@@ -11536,8 +11535,8 @@
   if (VI == I->getSecond().end()) {
    RecordDecl *RD = C.buildImplicitRecord("lasprivate.conditional");
    RD->startDefinition();
-    VDField = addFieldToRecordDecl(C, RD, VD->getType().getNonReferenceType());
-    FiredField = addFieldToRecordDecl(C, RD, C.CharTy);
+    VDField = CodeGenUtil::addFieldToRecordDecl(C, RD, VD->getType().getNonReferenceType());
+    FiredField = CodeGenUtil::addFieldToRecordDecl(C, RD, C.CharTy);
    RD->completeDefinition();
    NewType = C.getRecordType(RD);
    Address Addr = CGF.CreateMemTemp(NewType, C.getDeclAlign(VD), VD->getName());
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h
--- a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h
@@ -27,6 +27,16 @@
 public:
   explicit CGOpenMPRuntimeAMDGCN(CodeGenModule &CGM);
 
+private:
+  /// Current nesting level of parallel region
+  int ParallelLevel = 0;
+
+  /// Maximum nesting level of parallel region
+  int MaxParallelLevel = 0;
+
+  /// Struct to store kernel descriptors
+  QualType TgtAttributeStructQTy;
+
   /// Get the GPU warp size.
   llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) override;
 
@@ -35,6 +45,72 @@
   /// Get the maximum number of threads in a block of the GPU.
   llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) override;
+
+  /// Allocate global variable for TransferMedium
+  llvm::GlobalVariable *
+  allocateTransferMediumGlobal(CodeGenModule &CGM, llvm::ArrayType *Ty,
+                               StringRef TransferMediumName) override;
+
+  /// Allocate global variable for SharedStaticRD
+  llvm::GlobalVariable *
+  allocateSharedStaticRDGlobal(CodeGenModule &CGM,
+                               llvm::Type *LLVMStaticTy) override;
+
+  /// Get global variable KernelStaticGlobalized which is a shared pointer for
+  /// the global memory in the global memory buffer used for the given kernel
+  llvm::GlobalVariable *
+  allocateKernelStaticGlobalized(CodeGenModule &CGM) override;
+
+  /// Get target specific PrePostActionTy
+  PrePostActionTy *getPrePostActionTy() override;
+
+  /// Target independent wrapper over target specific emitSPMDKernel()
+  void emitSPMDKernelWrapper(const OMPExecutableDirective &D,
+                             StringRef ParentName, llvm::Function *&OutlinedFn,
+                             llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
+                             const RegionCodeGenTy &CodeGen) override;
+
+  /// Target independent wrapper over target specific emitNonSPMDKernel()
+  void emitNonSPMDKernelWrapper(const OMPExecutableDirective &D,
+                                StringRef ParentName,
+                                llvm::Function *&OutlinedFn,
+                                llvm::Constant *&OutlinedFnID,
+                                bool IsOffloadEntry,
+                                const RegionCodeGenTy &CodeGen) override;
+
+  /// Create a unique global variable to indicate the flat-work-group-size
+  /// for this region. Values are [256..1024].
+  static void setPropertyWorkGroupSize(CodeGenModule &CGM, StringRef Name,
+                                       unsigned WGSize);
+
+  /// Generate global variables _wg_size, kern_desc, __tgt_attribute_struct.
+  /// Also generate appropriate value of attribute amdgpu-flat-work-group-size
+  void generateMetaData(CodeGenModule &CGM, const OMPExecutableDirective &D,
+                        llvm::Function *&OutlinedFn, bool IsGeneric);
+
+  /// Returns __tgt_attribute_struct type.
+  QualType getTgtAttributeStructQTy();
+
+  /// Emit structure descriptor for a kernel
+  void emitStructureKernelDesc(CodeGenModule &CGM, StringRef Name,
+                               int16_t WG_Size, int8_t Mode,
+                               int8_t HostServices, int8_t MaxParallelLevel);
+
+  class AMDGCNPrePostActionTy final : public PrePostActionTy {
+    int &ParallelLevel;
+    int &MaxParallelLevel;
+
+  public:
+    AMDGCNPrePostActionTy(int &ParallelLevel, int &MaxParallelLevel)
+        : ParallelLevel(ParallelLevel), MaxParallelLevel(MaxParallelLevel) {}
+    void Enter(CodeGenFunction &CGF) override {
+      // Count the number of nested parallels.
+      if (ParallelLevel > MaxParallelLevel)
+        MaxParallelLevel = ParallelLevel;
+      ParallelLevel++;
+    }
+    void Exit(CodeGenFunction &CGF) override { ParallelLevel--; }
+  };
 };
 
 } // namespace CodeGen
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
--- a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "CGOpenMPRuntimeAMDGCN.h"
+#include "CGOpenMPRuntime.h"
 #include "CGOpenMPRuntimeGPU.h"
 #include "CodeGenFunction.h"
 #include "clang/AST/Attr.h"
@@ -30,6 +31,7 @@
     : CGOpenMPRuntimeGPU(CGM) {
   if (!CGM.getLangOpts().OpenMPIsDevice)
     llvm_unreachable("OpenMP AMDGCN can only handle device code.");
+  StaticRDLinkage = llvm::GlobalValue::PrivateLinkage;
 }
 
 llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUWarpSize(CodeGenFunction &CGF) {
@@ -59,3 +61,176 @@
   return Bld.CreateTrunc(
       Bld.CreateCall(F, {Bld.getInt32(0)}, "nvptx_num_threads"), CGF.Int32Ty);
 }
+
+llvm::GlobalVariable *CGOpenMPRuntimeAMDGCN::allocateTransferMediumGlobal(
+    CodeGenModule &CGM, llvm::ArrayType *Ty, StringRef TransferMediumName) {
+  return new llvm::GlobalVariable(
+      CGM.getModule(), Ty, /*isConstant=*/false,
+      llvm::GlobalVariable::WeakAnyLinkage, llvm::UndefValue::get(Ty),
+      TransferMediumName,
+      /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared),
+      /*isExternallyInitialized*/ true);
+}
+
+llvm::GlobalVariable *
+CGOpenMPRuntimeAMDGCN::allocateSharedStaticRDGlobal(CodeGenModule &CGM,
+                                                    llvm::Type *LLVMStaticTy) {
+  return new llvm::GlobalVariable(
+      CGM.getModule(), LLVMStaticTy,
+      /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage,
+      llvm::UndefValue::get(LLVMStaticTy), "_openmp_shared_static_glob_rd_$_",
+      /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared),
+      /*isExternallyInitialized*/ true);
+}
+
+llvm::GlobalVariable *
+CGOpenMPRuntimeAMDGCN::allocateKernelStaticGlobalized(CodeGenModule &CGM) {
+  return new llvm::GlobalVariable(
+      CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
+      llvm::GlobalValue::WeakAnyLinkage, llvm::UndefValue::get(CGM.VoidPtrTy),
+      "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
+      llvm::GlobalValue::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared),
+      /*isExternallyInitialized*/ true);
+}
+
+void CGOpenMPRuntimeAMDGCN::setPropertyWorkGroupSize(CodeGenModule &CGM,
+                                                     StringRef Name,
+                                                     unsigned WGSize) {
+  auto *GVMode = new llvm::GlobalVariable(
+      CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true,
+      llvm::GlobalValue::WeakAnyLinkage,
+      llvm::ConstantInt::get(CGM.Int16Ty, WGSize), Name + Twine("_wg_size"),
+      /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_device),
+      /*isExternallyInitialized*/ false);
+  CGM.addCompilerUsedGlobal(GVMode);
+}
+
+void CGOpenMPRuntimeAMDGCN::generateMetaData(CodeGenModule &CGM,
+                                             const OMPExecutableDirective &D,
+                                             llvm::Function *&OutlinedFn,
+                                             bool IsGeneric) {
+  int FlatAttr = 0;
+  bool FlatAttrEmitted = false;
+  unsigned DefaultWorkGroupSz =
+      CGM.getTarget().getGridValue(llvm::omp::GVIDX::GV_Default_WG_Size);
+
+  if (isOpenMPTeamsDirective(D.getDirectiveKind()) ||
+      isOpenMPParallelDirective(D.getDirectiveKind())) {
+    const auto *ThreadLimitClause = D.getSingleClause<OMPThreadLimitClause>();
+    const auto *NumThreadsClause = D.getSingleClause<OMPNumThreadsClause>();
+    unsigned MaxWorkGroupSz =
+        CGM.getTarget().getGridValue(llvm::omp::GVIDX::GV_Max_WG_Size);
+    unsigned CompileTimeThreadLimit = 0;
+    // Only one of thread_limit or num_threads is used, can't do it for both
+    if (ThreadLimitClause && !NumThreadsClause) {
+      Expr *ThreadLimitExpr = ThreadLimitClause->getThreadLimit();
+      clang::Expr::EvalResult Result;
+      if (ThreadLimitExpr->EvaluateAsInt(Result, CGM.getContext()))
+        CompileTimeThreadLimit = Result.Val.getInt().getExtValue();
+    } else if (!ThreadLimitClause && NumThreadsClause) {
+      Expr *NumThreadsExpr = NumThreadsClause->getNumThreads();
+      clang::Expr::EvalResult Result;
+      if (NumThreadsExpr->EvaluateAsInt(Result, CGM.getContext()))
+        CompileTimeThreadLimit = Result.Val.getInt().getExtValue();
+    }
+
+    // Add kernel metadata if ThreadLimit Clause is compile time constant > 0
+    if (CompileTimeThreadLimit > 0) {
+      // Add the WarpSize to generic, to reflect what runtime dispatch does.
+      if (IsGeneric)
+        CompileTimeThreadLimit +=
+            CGM.getTarget().getGridValue(llvm::omp::GVIDX::GV_Warp_Size);
+      if (CompileTimeThreadLimit > MaxWorkGroupSz)
+        CompileTimeThreadLimit = MaxWorkGroupSz;
+      std::string AttrVal = llvm::utostr(CompileTimeThreadLimit);
+      FlatAttr = CompileTimeThreadLimit;
+      OutlinedFn->addFnAttr("amdgpu-flat-work-group-size",
+                            AttrVal + "," + AttrVal);
+      setPropertyWorkGroupSize(CGM, OutlinedFn->getName(),
+                               CompileTimeThreadLimit);
+    }
+    FlatAttrEmitted = true;
+  } // end of amdgcn teams or parallel directive
+
+  // emit amdgpu-flat-work-group-size if not emitted already.
+  if (!FlatAttrEmitted) {
+    std::string FlatAttrVal = llvm::utostr(DefaultWorkGroupSz);
+    OutlinedFn->addFnAttr("amdgpu-flat-work-group-size",
+                          FlatAttrVal + "," + FlatAttrVal);
+  }
+  // Emit a kernel descriptor for runtime.
+  StringRef KernDescName = OutlinedFn->getName();
+  CGOpenMPRuntimeAMDGCN::emitStructureKernelDesc(CGM, KernDescName, FlatAttr,
+                                                 IsGeneric,
+                                                 1, // Uses HostServices
+                                                 MaxParallelLevel);
+  // Reset it to zero for any subsequent kernel
+  MaxParallelLevel = 0;
+}
+
+void CGOpenMPRuntimeAMDGCN::emitSPMDKernelWrapper(
+    const OMPExecutableDirective &D, StringRef ParentName,
+    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+  emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+                 CodeGen);
+  generateMetaData(CGM, D, OutlinedFn, /*SPMD*/ false);
+}
+
+void CGOpenMPRuntimeAMDGCN::emitNonSPMDKernelWrapper(
+    const OMPExecutableDirective &D, StringRef ParentName,
+    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+  emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+                    CodeGen);
+  generateMetaData(CGM, D, OutlinedFn, /*Generic*/ true);
+}
+
+PrePostActionTy *CGOpenMPRuntimeAMDGCN::getPrePostActionTy() {
+  return new AMDGCNPrePostActionTy(ParallelLevel, MaxParallelLevel);
+}
+
+/// Emit structure descriptor for a kernel
+void CGOpenMPRuntimeAMDGCN::emitStructureKernelDesc(
+    CodeGenModule &CGM, StringRef Name, int16_t WG_Size, int8_t Mode,
+    int8_t HostServices, int8_t MaxParallelLevel) {
+
+  // Create all device images
+  llvm::Constant *AttrData[] = {
+      llvm::ConstantInt::get(CGM.Int16Ty, 2), // Version
+      llvm::ConstantInt::get(CGM.Int16Ty, 9), // Size in bytes
+      llvm::ConstantInt::get(CGM.Int16Ty, WG_Size),
+      llvm::ConstantInt::get(CGM.Int8Ty, Mode), // 0 => SPMD, 1 => GENERIC
+      llvm::ConstantInt::get(CGM.Int8Ty, HostServices), // 1 => use HostServices
+      llvm::ConstantInt::get(CGM.Int8Ty, MaxParallelLevel)}; // number of nests
+
+  llvm::GlobalVariable *AttrImages =
+      clang::CodeGen::CodeGenUtil::createGlobalStruct(
+          CGM, getTgtAttributeStructQTy(), isDefaultLocationConstant(),
+          AttrData, Name + Twine("_kern_desc"),
+          llvm::GlobalValue::WeakAnyLinkage);
+  CGM.addCompilerUsedGlobal(AttrImages);
+}
+
+// Create Tgt Attribute Struct type.
+QualType CGOpenMPRuntimeAMDGCN::getTgtAttributeStructQTy() {
+  ASTContext &C = CGM.getContext();
+  QualType KmpInt8Ty = C.getIntTypeForBitwidth(/*Width=*/8, /*Signed=*/1);
+  QualType KmpInt16Ty = C.getIntTypeForBitwidth(/*Width=*/16, /*Signed=*/1);
+  if (TgtAttributeStructQTy.isNull()) {
+    RecordDecl *RD = C.buildImplicitRecord("__tgt_attribute_struct");
+    RD->startDefinition();
+    clang::CodeGen::CodeGenUtil::addFieldToRecordDecl(C, RD, KmpInt16Ty); // Version
+    clang::CodeGen::CodeGenUtil::addFieldToRecordDecl(
+        C, RD, KmpInt16Ty); // Struct Size in bytes.
+    clang::CodeGen::CodeGenUtil::addFieldToRecordDecl(C, RD, KmpInt16Ty); // WG_size
+    clang::CodeGen::CodeGenUtil::addFieldToRecordDecl(C, RD, KmpInt8Ty); // Mode
+    clang::CodeGen::CodeGenUtil::addFieldToRecordDecl(C, RD, KmpInt8Ty); // HostServices
+    clang::CodeGen::CodeGenUtil::addFieldToRecordDecl(C, RD, KmpInt8Ty); // MaxParallelLevel
+    RD->completeDefinition();
+    TgtAttributeStructQTy = C.getRecordType(RD);
+  }
+  return TgtAttributeStructQTy;
+}
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
@@ -33,6 +33,14 @@
     /// Unknown execution mode (orphaned directive).
     EM_Unknown,
   };
+
+protected:
+  /// true if we're definitely in the parallel region.
+  bool IsInParallelRegion = false;
+
+  /// Linkage type of StaticRD Global variable
+  llvm::GlobalValue::LinkageTypes StaticRDLinkage;
+
 private:
   /// Parallel outlined function work for workers to execute.
   llvm::SmallVector<llvm::Function *, 16> Work;
@@ -99,36 +107,6 @@
                          uint64_t Size, int32_t Flags,
                          llvm::GlobalValue::LinkageTypes Linkage) override;
 
-  /// Emit outlined function specialized for the Fork-Join
-  /// programming model for applicable target directives on the NVPTX device.
-  /// \param D Directive to emit.
-  /// \param ParentName Name of the function that encloses the target region.
-  /// \param OutlinedFn Outlined function value to be defined by this call.
-  /// \param OutlinedFnID Outlined function ID value to be defined by this call.
-  /// \param IsOffloadEntry True if the outlined function is an offload entry.
-  /// An outlined function may not be an entry if, e.g. the if clause always
-  /// evaluates to false.
-  void emitNonSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName,
-                         llvm::Function *&OutlinedFn,
-                         llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
-                         const RegionCodeGenTy &CodeGen);
-
-  /// Emit outlined function specialized for the Single Program
-  /// Multiple Data programming model for applicable target directives on the
-  /// NVPTX device.
-  /// \param D Directive to emit.
-  /// \param ParentName Name of the function that encloses the target region.
-  /// \param OutlinedFn Outlined function value to be defined by this call.
-  /// \param OutlinedFnID Outlined function ID value to be defined by this call.
-  /// \param IsOffloadEntry True if the outlined function is an offload entry.
-  /// \param CodeGen Object containing the target statements.
-  /// An outlined function may not be an entry if, e.g. the if clause always
-  /// evaluates to false.
-  void emitSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName,
-                      llvm::Function *&OutlinedFn,
-                      llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
-                      const RegionCodeGenTy &CodeGen);
-
   /// Emit outlined function for 'target' directive on the NVPTX
   /// device.
   /// \param D Directive to emit.
@@ -194,6 +172,36 @@
   /// Full/Lightweight runtime mode. Used for better optimization.
   unsigned getDefaultLocationReserved2Flags() const override;
 
+  /// Emit outlined function specialized for the Fork-Join
+  /// programming model for applicable target directives on the NVPTX device.
+  /// \param D Directive to emit.
+  /// \param ParentName Name of the function that encloses the target region.
+  /// \param OutlinedFn Outlined function value to be defined by this call.
+  /// \param OutlinedFnID Outlined function ID value to be defined by this call.
+  /// \param IsOffloadEntry True if the outlined function is an offload entry.
+  /// An outlined function may not be an entry if, e.g. the if clause always
+  /// evaluates to false.
+  void emitNonSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName,
+                         llvm::Function *&OutlinedFn,
+                         llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
+                         const RegionCodeGenTy &CodeGen);
+
+  /// Emit outlined function specialized for the Single Program
+  /// Multiple Data programming model for applicable target directives on the
+  /// NVPTX device.
+  /// \param D Directive to emit.
+  /// \param ParentName Name of the function that encloses the target region.
+  /// \param OutlinedFn Outlined function value to be defined by this call.
+  /// \param OutlinedFnID Outlined function ID value to be defined by this call.
+  /// \param IsOffloadEntry True if the outlined function is an offload entry.
+  /// \param CodeGen Object containing the target statements.
+  /// An outlined function may not be an entry if, e.g. the if clause always
+  /// evaluates to false.
+  void emitSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName,
+                      llvm::Function *&OutlinedFn,
+                      llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
+                      const RegionCodeGenTy &CodeGen);
+
 public:
   explicit CGOpenMPRuntimeGPU(CodeGenModule &CGM);
   void clear() override;
@@ -211,6 +219,39 @@
   /// Get the maximum number of threads in a block of the GPU.
   virtual llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) = 0;
 
+  /// Allocate global variable for TransferMedium
+  virtual llvm::GlobalVariable *
+  allocateTransferMediumGlobal(CodeGenModule &CGM, llvm::ArrayType *Ty,
+                               StringRef TransferMediumName) = 0;
+
+  /// Allocate global variable for SharedStaticRD
+  virtual llvm::GlobalVariable *
+  allocateSharedStaticRDGlobal(CodeGenModule &CGM,
+                               llvm::Type *LLVMStaticTy) = 0;
+
+  /// Allocate global variable for KernelStaticGlobalized
+  virtual llvm::GlobalVariable *
+  allocateKernelStaticGlobalized(CodeGenModule &CGM) = 0;
+
+  /// Get target specific PrePostAction
+  virtual PrePostActionTy *getPrePostActionTy() = 0;
+
+  /// Target independent wrapper over target specific emitSPMDKernel()
+  virtual void emitSPMDKernelWrapper(const OMPExecutableDirective &D,
+                                     StringRef ParentName,
+                                     llvm::Function *&OutlinedFn,
+                                     llvm::Constant *&OutlinedFnID,
+                                     bool IsOffloadEntry,
+                                     const RegionCodeGenTy &CodeGen) = 0;
+
+  /// Target independent wrapper over target specific emitNonSPMDKernel()
+  virtual void emitNonSPMDKernelWrapper(const OMPExecutableDirective &D,
+                                        StringRef ParentName,
+                                        llvm::Function *&OutlinedFn,
+                                        llvm::Constant *&OutlinedFnID,
+                                        bool IsOffloadEntry,
+                                        const RegionCodeGenTy &CodeGen) = 0;
+
   /// Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32
   /// global_tid, int proc_bind) to generate code for 'proc_bind' clause.
   virtual void emitProcBindClause(CodeGenFunction &CGF,
@@ -424,8 +465,6 @@
   /// true if currently emitting code for target/teams/distribute region, false
   /// - otherwise.
   bool IsInTTDRegion = false;
-  /// true if we're definitely in the parallel region.
-  bool IsInParallelRegion = false;
   /// Map between an outlined function and its wrapper.
   llvm::DenseMap<llvm::Function *, llvm::Function *> WrapperFunctionsMap;
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "CGOpenMPRuntimeGPU.h"
+#include "CGOpenMPRuntimeAMDGCN.h"
 #include "CGOpenMPRuntimeNVPTX.h"
 #include "CodeGenFunction.h"
 #include "clang/AST/Attr.h"
@@ -1190,13 +1191,8 @@
   // Reserve place for the globalized memory.
   GlobalizedRecords.emplace_back();
   if (!KernelStaticGlobalized) {
-    KernelStaticGlobalized = new llvm::GlobalVariable(
-        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
-        llvm::GlobalValue::InternalLinkage,
-        llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
-        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
-        llvm::GlobalValue::NotThreadLocal,
-        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
+    auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGM.getOpenMPRuntime());
+    KernelStaticGlobalized = RT.allocateKernelStaticGlobalized(CGM);
   }
   emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                    IsOffloadEntry, CodeGen);
@@ -1321,13 +1317,8 @@
   // Reserve place for the globalized memory.
   GlobalizedRecords.emplace_back();
   if (!KernelStaticGlobalized) {
-    KernelStaticGlobalized = new llvm::GlobalVariable(
-        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
-        llvm::GlobalValue::InternalLinkage,
-        llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
-        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
-        llvm::GlobalValue::NotThreadLocal,
-        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
+    auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGM.getOpenMPRuntime());
+    KernelStaticGlobalized = RT.allocateKernelStaticGlobalized(CGM);
   }
   emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                    IsOffloadEntry, CodeGen);
@@ -1888,12 +1879,11 @@
   bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
   if (Mode)
-    emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
-                   CodeGen);
+    emitSPMDKernelWrapper(D, ParentName, OutlinedFn, OutlinedFnID,
+                          IsOffloadEntry, CodeGen);
   else
-    emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
-                      CodeGen);
-
+    emitNonSPMDKernelWrapper(D, ParentName, OutlinedFn, OutlinedFnID,
+                             IsOffloadEntry, CodeGen);
   setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
 }
 
@@ -1963,22 +1953,9 @@
     const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
     OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
   // Emit target region as a standalone region.
-  class NVPTXPrePostActionTy : public PrePostActionTy {
-    bool &IsInParallelRegion;
-    bool PrevIsInParallelRegion;
-
-  public:
-    NVPTXPrePostActionTy(bool &IsInParallelRegion)
-        : IsInParallelRegion(IsInParallelRegion) {}
-    void Enter(CodeGenFunction &CGF) override {
-      PrevIsInParallelRegion = IsInParallelRegion;
-      IsInParallelRegion = true;
-    }
-    void Exit(CodeGenFunction &CGF) override {
-      IsInParallelRegion = PrevIsInParallelRegion;
-    }
-  } Action(IsInParallelRegion);
-  CodeGen.setAction(Action);
+  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGM.getOpenMPRuntime());
+  std::unique_ptr<PrePostActionTy> Action(RT.getPrePostActionTy());
+  CodeGen.setAction(*Action);
   bool PrevIsInTTDRegion = IsInTTDRegion;
   IsInTTDRegion = false;
   bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
@@ -3231,6 +3208,7 @@
   CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
 
   CGBuilderTy &Bld = CGF.Builder;
+  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
 
   // This array is used as a medium to transfer, one reduce element at a time,
   // the data from the first lane of every warp to lanes in the first warp
@@ -3246,16 +3224,11 @@
   unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size);
   if (!TransferMedium) {
     auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
-    unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
-    TransferMedium = new llvm::GlobalVariable(
-        M, Ty, /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
-        llvm::Constant::getNullValue(Ty), TransferMediumName,
-        /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
-        SharedAddressSpace);
+    TransferMedium =
+        RT.allocateTransferMediumGlobal(CGM, Ty, TransferMediumName);
     CGM.addCompilerUsedGlobal(TransferMedium);
   }
 
-  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
   // Get the CUDA thread id of the current OpenMP thread on the GPU.
   llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
   // nvptx_lane_id = nvptx_id % warpsize
@@ -5104,6 +5077,7 @@
 }
 
 void CGOpenMPRuntimeGPU::clear() {
+  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGM.getOpenMPRuntime());
   if (!GlobalizedRecords.empty() &&
       !CGM.getLangOpts().OpenMPCUDATargetParallel) {
     ASTContext &C = CGM.getContext();
@@ -5152,9 +5126,6 @@
             llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0));
       }
       // Allocate SharedMemorySize buffer for the shared memory.
-      // FIXME: nvlink does not handle weak linkage correctly (object with the
-      // different size are reported as erroneous).
-      // Restore this code as sson as nvlink is fixed.
       if (!SharedStaticRD->field_empty()) {
         llvm::APInt ArySize(/*numBits=*/64, SharedMemorySize);
         QualType SubTy = C.getConstantArrayType(
@@ -5171,13 +5142,7 @@
     if (!SharedStaticRD->field_empty()) {
       QualType StaticTy = C.getRecordType(SharedStaticRD);
       llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy);
-      auto *GV = new llvm::GlobalVariable(
-          CGM.getModule(), LLVMStaticTy,
-          /*isConstant=*/false, llvm::GlobalValue::CommonLinkage,
-          llvm::Constant::getNullValue(LLVMStaticTy),
-          "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr,
-          llvm::GlobalValue::NotThreadLocal,
-          C.getTargetAddressSpace(LangAS::cuda_shared));
+      auto *GV = RT.allocateSharedStaticRDGlobal(CGM, LLVMStaticTy);
       auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
           GV, CGM.VoidPtrTy);
       for (const GlobalPtrSizeRecsTy *Rec : SharedRecs) {
@@ -5198,14 +5163,11 @@
           C.getConstantArrayType(Arr1Ty, Size2, nullptr, ArrayType::Normal,
                                  /*IndexTypeQuals=*/0);
       llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
-      // FIXME: nvlink does not handle weak linkage correctly (object with the
-      // different size are reported as erroneous).
-      // Restore CommonLinkage as soon as nvlink is fixed.
-      auto *GV = new llvm::GlobalVariable(
-          CGM.getModule(), LLVMArr2Ty,
-          /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
-          llvm::Constant::getNullValue(LLVMArr2Ty),
-          "_openmp_static_glob_rd_$_");
+      auto *GV =
+          new llvm::GlobalVariable(CGM.getModule(), LLVMArr2Ty,
+                                   /*isConstant=*/false, RT.StaticRDLinkage,
+                                   llvm::Constant::getNullValue(LLVMArr2Ty),
+                                   "_openmp_static_glob_rd_$_");
       auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
           GV, CGM.VoidPtrTy);
       for (const GlobalPtrSizeRecsTy *Rec : GlobalRecs) {
@@ -5238,7 +5200,7 @@
       // Restore CommonLinkage as soon as nvlink is fixed.
       auto *GV = new llvm::GlobalVariable(
           CGM.getModule(), LLVMReductionsBufferTy,
-          /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
+          /*isConstant=*/false, RT.StaticRDLinkage,
          llvm::Constant::getNullValue(LLVMReductionsBufferTy),
          "_openmp_teams_reductions_buffer_$_");
      KernelTeamsReductionPtr->setInitializer(
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
@@ -35,6 +35,54 @@
   /// Get the maximum number of threads in a block of the GPU.
   llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) override;
 
+  /// Allocate global variable for TransferMedium
+  llvm::GlobalVariable *allocateTransferMediumGlobal(CodeGenModule &CGM,
+                                                     llvm::ArrayType *Ty,
+                                                     StringRef Name) override;
+
+  /// Allocate global variable for SharedStaticRD
+  llvm::GlobalVariable *
+  allocateSharedStaticRDGlobal(CodeGenModule &CGM,
+                               llvm::Type *LLVMStaticTy) override;
+
+  /// Allocate global variable for KernelStaticGlobalized
+  llvm::GlobalVariable *
+  allocateKernelStaticGlobalized(CodeGenModule &CGM) override;
+
+  /// Get target specific PrePostAction
+  PrePostActionTy *getPrePostActionTy() override;
+
+  /// Target independent wrapper over target specific emitSPMDKernel()
+  void emitSPMDKernelWrapper(const OMPExecutableDirective &D,
+                             StringRef ParentName, llvm::Function *&OutlinedFn,
+                             llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
+                             const RegionCodeGenTy &CodeGen) override;
+
+  /// Target independent wrapper over target specific emitNonSPMDKernel()
+  void emitNonSPMDKernelWrapper(const OMPExecutableDirective &D,
+                                StringRef ParentName,
+                                llvm::Function *&OutlinedFn,
+                                llvm::Constant *&OutlinedFnID,
+                                bool IsOffloadEntry,
+                                const RegionCodeGenTy &CodeGen) override;
+
+  /// NVPTX specific class for PrePostActionTy
+  class NVPTXPrePostActionTy final : public PrePostActionTy {
+    bool &IsInParallelRegion;
+    bool PrevIsInParallelRegion;
+
+  public:
+    NVPTXPrePostActionTy(bool &IsInParallelRegion)
+        : IsInParallelRegion(IsInParallelRegion) {}
+    void Enter(CodeGenFunction &CGF) override {
+      PrevIsInParallelRegion = IsInParallelRegion;
+      IsInParallelRegion = true;
+    }
+    void Exit(CodeGenFunction &CGF) override {
+      IsInParallelRegion = PrevIsInParallelRegion;
+    }
+  };
 };
 
 } // CodeGen namespace.
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -30,6 +30,11 @@
     : CGOpenMPRuntimeGPU(CGM) {
   if (!CGM.getLangOpts().OpenMPIsDevice)
     llvm_unreachable("OpenMP NVPTX can only handle device code.");
+
+  // FIXME: nvlink does not handle weak linkage correctly (object with the
+  // different size are reported as erroneous).
+  // Restore CommonLinkage as soon as nvlink is fixed.
+  StaticRDLinkage = llvm::GlobalValue::InternalLinkage;
 }
 
 llvm::Value *CGOpenMPRuntimeNVPTX::getGPUWarpSize(CodeGenFunction &CGF) {
@@ -54,3 +59,56 @@
       &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x);
   return Bld.CreateCall(F, llvm::None, "nvptx_num_threads");
 }
+
+llvm::GlobalVariable *CGOpenMPRuntimeNVPTX::allocateTransferMediumGlobal(
+    CodeGenModule &CGM, llvm::ArrayType *Ty, StringRef TransferMediumName) {
+  return new llvm::GlobalVariable(
+      CGM.getModule(), Ty, /*isConstant=*/false,
+      llvm::GlobalVariable::CommonLinkage, llvm::Constant::getNullValue(Ty),
+      TransferMediumName,
+      /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
+}
+
+llvm::GlobalVariable *
+CGOpenMPRuntimeNVPTX::allocateSharedStaticRDGlobal(CodeGenModule &CGM,
+                                                   llvm::Type *LLVMStaticTy) {
+  return new llvm::GlobalVariable(
+      CGM.getModule(), LLVMStaticTy,
+      /*isConstant=*/false, llvm::GlobalValue::CommonLinkage,
+      llvm::Constant::getNullValue(LLVMStaticTy),
+      "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr,
+      llvm::GlobalValue::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
+}
+
+llvm::GlobalVariable *
+CGOpenMPRuntimeNVPTX::allocateKernelStaticGlobalized(CodeGenModule &CGM) {
+  return new llvm::GlobalVariable(
+      CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
+      llvm::GlobalValue::InternalLinkage,
+      llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
+      "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
+      llvm::GlobalValue::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
+}
+
+void CGOpenMPRuntimeNVPTX::emitSPMDKernelWrapper(
+    const OMPExecutableDirective &D, StringRef ParentName,
+    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+  emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+                 CodeGen);
+}
+
+void CGOpenMPRuntimeNVPTX::emitNonSPMDKernelWrapper(
+    const OMPExecutableDirective &D, StringRef ParentName,
+    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+  emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+                    CodeGen);
+}
+
+PrePostActionTy *CGOpenMPRuntimeNVPTX::getPrePostActionTy() {
+  return new NVPTXPrePostActionTy(IsInParallelRegion);
+}
diff --git a/clang/test/OpenMP/amdgcn_target_codegen.cpp b/clang/test/OpenMP/amdgcn_target_codegen.cpp
--- a/clang/test/OpenMP/amdgcn_target_codegen.cpp
+++ b/clang/test/OpenMP/amdgcn_target_codegen.cpp
@@ -8,6 +8,29 @@
 
 #define N 1000
 
+// CHECK: @"_openmp_kernel_static_glob_rd$ptr" = weak addrspace(3) externally_initialized global i8* undef
+
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_tid_threadsv_l[[LINE1:.+]]_kern_desc = weak constant %struct.__tgt_attribute_struct { i16 2, i16 9, i16 0, i8 1, i8 1, i8 0 }, align 2
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_tid_threadsv_l[[LINE1]]_exec_mode = weak constant i8 1
+
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_tid_threads_simdv_l[[LINE2:.+]]_kern_desc = weak constant %struct.__tgt_attribute_struct { i16 2, i16 9, i16 0, i8 0, i8 1, i8 0 }, align 2
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_tid_threads_simdv_l[[LINE2]]_exec_mode = weak constant i8 0
+
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_max_parallel_levelv_l[[LINE3:.+]]_kern_desc = weak constant %struct.__tgt_attribute_struct { i16 2, i16 9, i16 0, i8 0, i8 1, i8 3 }, align 2
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_max_parallel_levelv_l[[LINE3]]_exec_mode = weak constant i8 0
+
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_spmdv_l[[LINE4:.+]]_wg_size = weak addrspace(1) constant i16 10
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_spmdv_l[[LINE4]]_kern_desc = weak constant %struct.__tgt_attribute_struct { i16 2, i16 9, i16 10, i8 0, i8 1, i8 0 }, align 2
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_spmdv_l[[LINE4]]_exec_mode = weak constant i8 0
+
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_non_spmdv_l[[LINE5:.+]]_wg_size = weak addrspace(1) constant i16 74
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_non_spmdv_l[[LINE5]]_kern_desc = weak constant %struct.__tgt_attribute_struct { i16 2, i16 9, i16 74, i8 1, i8 1, i8 0 }, align 2
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_non_spmdv_l[[LINE5]]_exec_mode = weak constant i8 1
+
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_max_work_group_sizev_l[[LINE6:.+]]_wg_size = weak addrspace(1) constant i16 1024
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_max_work_group_sizev_l[[LINE6]]_kern_desc = weak constant %struct.__tgt_attribute_struct { i16 2, i16 9, i16 1024, i8 1, i8 1, i8 0 }, align 2
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_max_work_group_sizev_l[[LINE6]]_exec_mode = weak constant i8 1
+
 int test_amdgcn_target_tid_threads() {
 // CHECK-LABEL: define weak void @{{.*}}test_amdgcn_target_tid_threads
 
@@ -40,4 +63,65 @@
   return arr[0];
 }
 
+int test_amdgcn_target_max_parallel_level() {
+  // CHECK-LABEL: define weak void @{{.*}}test_amdgcn_target_max_parallel_level
+  int arr[N];
+
+#pragma omp target parallel for
+  for (int i = 0; i < N; i++)
+#pragma omp parallel for
+    for (int j = 0; j < N; j++)
+#pragma omp parallel for
+      for (int k = 0; k < N; k++)
+        for (int l = 0; l < N; l++)
+#pragma omp parallel for
+          for (int m = 0; m < N; m++)
+            arr[m] = 0;
+
+  return arr[0];
+}
+
+int test_amdgcn_target_attributes_spmd() {
+  int arr[N];
+
+// CHECK: {{.*}}"amdgpu-flat-work-group-size"="10,10"
+#pragma omp target parallel num_threads(10)
+  for (int i = 0; i < N; i++) {
+    arr[i] = 1;
+  }
+
+  return arr[0];
+}
+
+int test_amdgcn_target_attributes_non_spmd() {
+  int arr[N];
+
+// CHECK: {{.*}}"amdgpu-flat-work-group-size"="74,74"
+#pragma omp target teams thread_limit(10)
+  for (int i = 0; i < N; i++) {
+    arr[i] = 1;
+  }
+
+  return arr[0];
+}
+
+int test_amdgcn_target_attributes_max_work_group_size() {
+  int arr[N];
+
+// CHECK: {{.*}}"amdgpu-flat-work-group-size"="1024,1024"
+#pragma omp target teams thread_limit(1500)
+  for (int i = 0; i < N; i++) {
+    arr[i] = 1;
+  }
+
+  return arr[0];
+}
+
 #endif
+
+// CHECK: !0 = !{i32 0, i32 [[ARG1:[0-9]+]], i32 [[ARG2:[0-9]+]], !"_Z37test_amdgcn_target_max_parallel_levelv", i32 [[LINE3]], i32 2}
+// CHECK: !1 = !{i32 0, i32 [[ARG1]], i32 [[ARG2]], !"_Z30test_amdgcn_target_tid_threadsv", i32 [[LINE1]], i32 0}
+// CHECK: !2 = !{i32 0, i32 [[ARG1]], i32 [[ARG2]], !"_Z35test_amdgcn_target_tid_threads_simdv", i32 [[LINE2]], i32 1}
+// CHECK: !3 = !{i32 0, i32 [[ARG1]], i32 [[ARG2]], !"_Z38test_amdgcn_target_attributes_non_spmdv", i32 [[LINE5]], i32 4}
+// CHECK: !4 = !{i32 0, i32 [[ARG1]], i32 [[ARG2]], !"_Z34test_amdgcn_target_attributes_spmdv", i32 [[LINE4]], i32 3}
+// CHECK: !5 = !{i32 0, i32 [[ARG1]], i32 [[ARG2]], !"_Z49test_amdgcn_target_attributes_max_work_group_sizev", i32 [[LINE6]], i32 5}
\ No newline at end of file