diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h
--- a/clang/lib/CodeGen/CGOpenMPRuntime.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.h
@@ -2488,6 +2488,20 @@
   }
 };

+/// Encapsulates helper methods used by target-specific specializations
+/// of CGOpenMPRuntimeGPU.
+class CodeGenUtil {
+public:
+  static FieldDecl *addFieldToRecordDecl(ASTContext &C, DeclContext *DC,
+                                         QualType FieldTy);
+
+  template <class... As>
+  static llvm::GlobalVariable *
+  createGlobalStruct(CodeGenModule &CGM, QualType Ty, bool IsConstant,
+                     ArrayRef<llvm::Constant *> Data, const Twine &Name,
+                     As &&... Args);
+};
+
 } // namespace CodeGen
 } // namespace clang
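For orientation, here is a minimal sketch (not part of the patch) of how a caller combines the two helpers: build an implicit record type field by field, then materialize a constant instance of it as a global variable. The function and record names are hypothetical, and since the createGlobalStruct template is defined in CGOpenMPRuntime.cpp, such a caller is assumed to live in the same translation unit.

    // Hypothetical helper, assumed to live in CGOpenMPRuntime.cpp so the
    // createGlobalStruct template definition is visible.
    static llvm::GlobalVariable *emitExampleDesc(CodeGenModule &CGM) {
      ASTContext &C = CGM.getContext();
      // Build an implicit record type: struct __example_desc { int a; void *b; };
      RecordDecl *RD = C.buildImplicitRecord("__example_desc");
      RD->startDefinition();
      CodeGenUtil::addFieldToRecordDecl(C, RD, C.IntTy);
      CodeGenUtil::addFieldToRecordDecl(C, RD, C.VoidPtrTy);
      RD->completeDefinition();

      // Materialize a weak constant global of that type.
      llvm::Constant *Data[] = {llvm::ConstantInt::get(CGM.Int32Ty, 1),
                                llvm::Constant::getNullValue(CGM.VoidPtrTy)};
      return CodeGenUtil::createGlobalStruct(
          CGM, C.getRecordType(RD), /*IsConstant=*/true, Data,
          "__example_desc_var", llvm::GlobalValue::WeakAnyLinkage);
    }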
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -1048,17 +1048,6 @@
                            AlignmentSource::Decl);
 }

-static FieldDecl *addFieldToRecordDecl(ASTContext &C, DeclContext *DC,
-                                       QualType FieldTy) {
-  auto *Field = FieldDecl::Create(
-      C, DC, SourceLocation(), SourceLocation(), /*Id=*/nullptr, FieldTy,
-      C.getTrivialTypeSourceInfo(FieldTy, SourceLocation()),
-      /*BW=*/nullptr, /*Mutable=*/false, /*InitStyle=*/ICIS_NoInit);
-  Field->setAccess(AS_public);
-  DC->addDecl(Field);
-  return Field;
-}
-
 CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM, StringRef FirstSeparator,
                                  StringRef Separator)
     : CGM(CGM), FirstSeparator(FirstSeparator), Separator(Separator),
@@ -1352,11 +1341,21 @@
   }
 }

+FieldDecl *clang::CodeGen::CodeGenUtil::addFieldToRecordDecl(
+    ASTContext &C, DeclContext *DC, QualType FieldTy) {
+  auto *Field = FieldDecl::Create(
+      C, DC, SourceLocation(), SourceLocation(), /*Id=*/nullptr, FieldTy,
+      C.getTrivialTypeSourceInfo(FieldTy, SourceLocation()),
+      /*BW=*/nullptr, /*Mutable=*/false, /*InitStyle=*/ICIS_NoInit);
+  Field->setAccess(AS_public);
+  DC->addDecl(Field);
+  return Field;
+}
+
 template <class... As>
-static llvm::GlobalVariable *
-createGlobalStruct(CodeGenModule &CGM, QualType Ty, bool IsConstant,
-                   ArrayRef<llvm::Constant *> Data, const Twine &Name,
-                   As &&... Args) {
+llvm::GlobalVariable *clang::CodeGen::CodeGenUtil::createGlobalStruct(
+    CodeGenModule &CGM, QualType Ty, bool IsConstant,
+    ArrayRef<llvm::Constant *> Data, const Twine &Name, As &&... Args) {
   const auto *RD = cast<RecordDecl>(Ty->getAsTagDecl());
   const CGRecordLayout &RL = CGM.getTypes().getCGRecordLayout(RD);
   ConstantInitBuilder CIBuilder(CGM);
@@ -3082,7 +3081,7 @@
       llvm::ConstantInt::get(CGM.Int32Ty, Flags),
       llvm::ConstantInt::get(CGM.Int32Ty, 0)};
   std::string EntryName = getName({"omp_offloading", "entry", ""});
-  llvm::GlobalVariable *Entry = createGlobalStruct(
+  llvm::GlobalVariable *Entry = CodeGenUtil::createGlobalStruct(
       CGM, getTgtOffloadEntryQTy(), /*IsConstant=*/true, Data,
       Twine(EntryName).concat(Name), llvm::GlobalValue::WeakAnyLinkage);
@@ -3360,12 +3359,12 @@
   ASTContext &C = CGM.getContext();
   RecordDecl *RD = C.buildImplicitRecord("__tgt_offload_entry");
   RD->startDefinition();
-  addFieldToRecordDecl(C, RD, C.VoidPtrTy);
-  addFieldToRecordDecl(C, RD, C.getPointerType(C.CharTy));
-  addFieldToRecordDecl(C, RD, C.getSizeType());
-  addFieldToRecordDecl(
+  CodeGenUtil::addFieldToRecordDecl(C, RD, C.VoidPtrTy);
+  CodeGenUtil::addFieldToRecordDecl(C, RD, C.getPointerType(C.CharTy));
+  CodeGenUtil::addFieldToRecordDecl(C, RD, C.getSizeType());
+  CodeGenUtil::addFieldToRecordDecl(
       C, RD, C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/true));
-  addFieldToRecordDecl(
+  CodeGenUtil::addFieldToRecordDecl(
       C, RD, C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/true));
   RD->completeDefinition();
   RD->addAttr(PackedAttr::CreateImplicit(C));
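The five fields registered above line up with the host-side descriptor that libomptarget consumes; for reference, a C++ mirror of that layout, as declared in the OpenMP runtime's omptarget.h:

    #include <cstddef>
    #include <cstdint>

    // Mirror of the packed __tgt_offload_entry record built above.
    struct __tgt_offload_entry {
      void *addr;       // Address of global symbol or kernel entry point.
      char *name;       // Symbol name.
      size_t size;      // Size of the entry in bytes; 0 for functions.
      int32_t flags;    // OpenMP offload entry flags.
      int32_t reserved; // Reserved; currently unused.
    };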
C.buildImplicitRecord("kmp_task_t_with_privates"); RD->startDefinition(); - addFieldToRecordDecl(C, RD, KmpTaskTQTy); + CodeGenUtil::addFieldToRecordDecl(C, RD, KmpTaskTQTy); if (const RecordDecl *PrivateRD = createPrivatesRecordDecl(CGM, Privates)) - addFieldToRecordDecl(C, RD, C.getRecordType(PrivateRD)); + CodeGenUtil::addFieldToRecordDecl(C, RD, C.getRecordType(PrivateRD)); RD->completeDefinition(); return RD; } @@ -4128,9 +4127,9 @@ RecordDecl *KmpAffinityInfoRD = C.buildImplicitRecord("kmp_task_affinity_info_t"); KmpAffinityInfoRD->startDefinition(); - addFieldToRecordDecl(C, KmpAffinityInfoRD, C.getIntPtrType()); - addFieldToRecordDecl(C, KmpAffinityInfoRD, C.getSizeType()); - addFieldToRecordDecl(C, KmpAffinityInfoRD, FlagsTy); + CodeGenUtil::addFieldToRecordDecl(C, KmpAffinityInfoRD, C.getIntPtrType()); + CodeGenUtil::addFieldToRecordDecl(C, KmpAffinityInfoRD, C.getSizeType()); + CodeGenUtil::addFieldToRecordDecl(C, KmpAffinityInfoRD, FlagsTy); KmpAffinityInfoRD->completeDefinition(); KmpTaskAffinityInfoTy = C.getRecordType(KmpAffinityInfoRD); } @@ -4568,9 +4567,9 @@ if (KmpDependInfoTy.isNull()) { RecordDecl *KmpDependInfoRD = C.buildImplicitRecord("kmp_depend_info"); KmpDependInfoRD->startDefinition(); - addFieldToRecordDecl(C, KmpDependInfoRD, C.getIntPtrType()); - addFieldToRecordDecl(C, KmpDependInfoRD, C.getSizeType()); - addFieldToRecordDecl(C, KmpDependInfoRD, FlagsTy); + CodeGenUtil::addFieldToRecordDecl(C, KmpDependInfoRD, C.getIntPtrType()); + CodeGenUtil::addFieldToRecordDecl(C, KmpDependInfoRD, C.getSizeType()); + CodeGenUtil::addFieldToRecordDecl(C, KmpDependInfoRD, FlagsTy); KmpDependInfoRD->completeDefinition(); KmpDependInfoTy = C.getRecordType(KmpDependInfoRD); } @@ -6017,13 +6016,19 @@ ASTContext &C = CGM.getContext(); RecordDecl *RD = C.buildImplicitRecord("kmp_taskred_input_t"); RD->startDefinition(); - const FieldDecl *SharedFD = addFieldToRecordDecl(C, RD, C.VoidPtrTy); - const FieldDecl *OrigFD = addFieldToRecordDecl(C, RD, C.VoidPtrTy); - const FieldDecl *SizeFD = addFieldToRecordDecl(C, RD, C.getSizeType()); - const FieldDecl *InitFD = addFieldToRecordDecl(C, RD, C.VoidPtrTy); - const FieldDecl *FiniFD = addFieldToRecordDecl(C, RD, C.VoidPtrTy); - const FieldDecl *CombFD = addFieldToRecordDecl(C, RD, C.VoidPtrTy); - const FieldDecl *FlagsFD = addFieldToRecordDecl( + const FieldDecl *SharedFD = + CodeGenUtil::addFieldToRecordDecl(C, RD, C.VoidPtrTy); + const FieldDecl *OrigFD = + CodeGenUtil::addFieldToRecordDecl(C, RD, C.VoidPtrTy); + const FieldDecl *SizeFD = + CodeGenUtil::addFieldToRecordDecl(C, RD, C.getSizeType()); + const FieldDecl *InitFD = + CodeGenUtil::addFieldToRecordDecl(C, RD, C.VoidPtrTy); + const FieldDecl *FiniFD = + CodeGenUtil::addFieldToRecordDecl(C, RD, C.VoidPtrTy); + const FieldDecl *CombFD = + CodeGenUtil::addFieldToRecordDecl(C, RD, C.VoidPtrTy); + const FieldDecl *FlagsFD = CodeGenUtil::addFieldToRecordDecl( C, RD, C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false)); RD->completeDefinition(); QualType RDType = C.getRecordType(RD); @@ -9058,9 +9063,9 @@ RecordDecl *RD; RD = C.buildImplicitRecord("descriptor_dim"); RD->startDefinition(); - addFieldToRecordDecl(C, RD, Int64Ty); - addFieldToRecordDecl(C, RD, Int64Ty); - addFieldToRecordDecl(C, RD, Int64Ty); + CodeGenUtil::addFieldToRecordDecl(C, RD, Int64Ty); + CodeGenUtil::addFieldToRecordDecl(C, RD, Int64Ty); + CodeGenUtil::addFieldToRecordDecl(C, RD, Int64Ty); RD->completeDefinition(); QualType DimTy = C.getRecordType(RD); @@ -11669,9 +11674,9 @@ // }; RD = 
C.buildImplicitRecord("kmp_dim"); RD->startDefinition(); - addFieldToRecordDecl(C, RD, Int64Ty); - addFieldToRecordDecl(C, RD, Int64Ty); - addFieldToRecordDecl(C, RD, Int64Ty); + CodeGenUtil::addFieldToRecordDecl(C, RD, Int64Ty); + CodeGenUtil::addFieldToRecordDecl(C, RD, Int64Ty); + CodeGenUtil::addFieldToRecordDecl(C, RD, Int64Ty); RD->completeDefinition(); KmpDimTy = C.getRecordType(RD); } else { @@ -12124,8 +12129,9 @@ if (VI == I->getSecond().end()) { RecordDecl *RD = C.buildImplicitRecord("lasprivate.conditional"); RD->startDefinition(); - VDField = addFieldToRecordDecl(C, RD, VD->getType().getNonReferenceType()); - FiredField = addFieldToRecordDecl(C, RD, C.CharTy); + VDField = CodeGenUtil::addFieldToRecordDecl( + C, RD, VD->getType().getNonReferenceType()); + FiredField = CodeGenUtil::addFieldToRecordDecl(C, RD, C.CharTy); RD->completeDefinition(); NewType = C.getRecordType(RD); Address Addr = CGF.CreateMemTemp(NewType, C.getDeclAlign(VD), VD->getName()); diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h --- a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h +++ b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h @@ -27,6 +27,10 @@ public: explicit CGOpenMPRuntimeAMDGCN(CodeGenModule &CGM); +private: + /// Struct to store kernel descriptors + QualType TgtAttributeStructQTy; + /// Get the GPU warp size. llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) override; @@ -35,6 +39,38 @@ /// Get the maximum number of threads in a block of the GPU. llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) override; + + /// Target independent wrapper over target specific emitSPMDKernel() + void emitSPMDKernelWrapper(const OMPExecutableDirective &D, + StringRef ParentName, llvm::Function *&OutlinedFn, + llvm::Constant *&OutlinedFnID, bool IsOffloadEntry, + const RegionCodeGenTy &CodeGen) override; + + /// Target independent wrapper over target specific emitNonSPMDKernel() + void emitNonSPMDKernelWrapper(const OMPExecutableDirective &D, + StringRef ParentName, + llvm::Function *&OutlinedFn, + llvm::Constant *&OutlinedFnID, + bool IsOffloadEntry, + const RegionCodeGenTy &CodeGen) override; + + /// Create a unique global variable to indicate the flat-work-group-size + /// for this region. Values are [256..1024]. + static void setPropertyWorkGroupSize(CodeGenModule &CGM, StringRef Name, + unsigned WGSize); + + /// Generate global variables _wg_size, kern_desc, __tgt_attribute_struct. + /// Also generate appropriate value of attribute amdgpu-flat-work-group-size + void generateMetaData(CodeGenModule &CGM, const OMPExecutableDirective &D, + llvm::Function *&OutlinedFn, bool IsGeneric); + + /// Returns __tgt_attribute_struct type. 
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
--- a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//

 #include "CGOpenMPRuntimeAMDGCN.h"
+#include "CGOpenMPRuntime.h"
 #include "CGOpenMPRuntimeGPU.h"
 #include "CodeGenFunction.h"
 #include "clang/AST/Attr.h"
@@ -26,10 +27,14 @@
 using namespace CodeGen;
 using namespace llvm::omp;

+//
+// Definitions of the virtual methods declared in CGOpenMPRuntimeGPU.
+//
 CGOpenMPRuntimeAMDGCN::CGOpenMPRuntimeAMDGCN(CodeGenModule &CGM)
     : CGOpenMPRuntimeGPU(CGM) {
   if (!CGM.getLangOpts().OpenMPIsDevice)
     llvm_unreachable("OpenMP AMDGCN can only handle device code.");
+  KernelStaticGlobalizedLinkage = llvm::GlobalValue::WeakAnyLinkage;
 }

 llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUWarpSize(CodeGenFunction &CGF) {
@@ -59,3 +64,145 @@
   return Bld.CreateTrunc(
       Bld.CreateCall(F, {Bld.getInt32(0)}, "nvptx_num_threads"), CGF.Int32Ty);
 }
+
+void CGOpenMPRuntimeAMDGCN::emitSPMDKernelWrapper(
+    const OMPExecutableDirective &D, StringRef ParentName,
+    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+  emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+                 CodeGen);
+  generateMetaData(CGM, D, OutlinedFn, /*IsGeneric=*/false);
+}
+
+void CGOpenMPRuntimeAMDGCN::emitNonSPMDKernelWrapper(
+    const OMPExecutableDirective &D, StringRef ParentName,
+    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+  emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+                    CodeGen);
+  generateMetaData(CGM, D, OutlinedFn, /*IsGeneric=*/true);
+}
+
+//
+// Definitions of AMDGCN-specific methods.
+//
+void CGOpenMPRuntimeAMDGCN::setPropertyWorkGroupSize(CodeGenModule &CGM,
+                                                     StringRef Name,
+                                                     unsigned WGSize) {
+  auto *GVMode = new llvm::GlobalVariable(
+      CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true,
+      llvm::GlobalValue::WeakAnyLinkage,
+      llvm::ConstantInt::get(CGM.Int16Ty, WGSize), Twine(Name, "_wg_size"),
+      /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_device));
+  CGM.addCompilerUsedGlobal(GVMode);
+}
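setPropertyWorkGroupSize emits, for each kernel <name>, a weak 16-bit constant global <name>_wg_size in the device address space so the runtime can read the chosen work-group size at load time. The generateMetaData definition that follows computes that size; here is a standalone arithmetic sketch of its clamping rule, under assumed AMDGCN grid values (warp size 64, maximum work-group size 1024):

    #include <algorithm>
    #include <cstdio>

    int main() {
      const unsigned WarpSize = 64, MaxWG = 1024; // assumed AMDGCN grid values
      unsigned ThreadLimit = 960; // e.g. from a num_threads(960) clause
      bool IsGeneric = true;      // non-SPMD kernels reserve a master warp
      unsigned WG = ThreadLimit + (IsGeneric ? WarpSize : 0); // 1024
      WG = std::min(WG, MaxWG);                               // clamp to 1024
      // The kernel gets this attribute plus a <name>_wg_size global of WG.
      std::printf("amdgpu-flat-work-group-size=\"%u,%u\"\n", WG, WG);
    }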
+void CGOpenMPRuntimeAMDGCN::generateMetaData(CodeGenModule &CGM,
+                                             const OMPExecutableDirective &D,
+                                             llvm::Function *&OutlinedFn,
+                                             bool IsGeneric) {
+  int FlatAttr = 0;
+  bool FlatAttrEmitted = false;
+  unsigned DefaultWorkGroupSz =
+      CGM.getTarget().getGridValue(llvm::omp::GVIDX::GV_Default_WG_Size);
+
+  if (isOpenMPTeamsDirective(D.getDirectiveKind()) ||
+      isOpenMPParallelDirective(D.getDirectiveKind())) {
+    const auto *ThreadLimitClause = D.getSingleClause<OMPThreadLimitClause>();
+    const auto *NumThreadsClause = D.getSingleClause<OMPNumThreadsClause>();
+    unsigned MaxWorkGroupSz =
+        CGM.getTarget().getGridValue(llvm::omp::GVIDX::GV_Max_WG_Size);
+    unsigned CompileTimeThreadLimit = 0;
+    // Only one of thread_limit or num_threads is used; we cannot handle both.
+    if (ThreadLimitClause && !NumThreadsClause) {
+      Expr *ThreadLimitExpr = ThreadLimitClause->getThreadLimit();
+      clang::Expr::EvalResult Result;
+      if (ThreadLimitExpr->EvaluateAsInt(Result, CGM.getContext()))
+        CompileTimeThreadLimit = Result.Val.getInt().getExtValue();
+    } else if (!ThreadLimitClause && NumThreadsClause) {
+      Expr *NumThreadsExpr = NumThreadsClause->getNumThreads();
+      clang::Expr::EvalResult Result;
+      if (NumThreadsExpr->EvaluateAsInt(Result, CGM.getContext()))
+        CompileTimeThreadLimit = Result.Val.getInt().getExtValue();
+    }
+
+    // Add kernel metadata if the thread limit is a compile-time constant > 0.
+    if (CompileTimeThreadLimit > 0) {
+      // Add the warp size in generic mode, to reflect what the runtime
+      // dispatch does.
+      if (IsGeneric)
+        CompileTimeThreadLimit +=
+            CGM.getTarget().getGridValue(llvm::omp::GVIDX::GV_Warp_Size);
+      if (CompileTimeThreadLimit > MaxWorkGroupSz)
+        CompileTimeThreadLimit = MaxWorkGroupSz;
+      std::string AttrVal = llvm::utostr(CompileTimeThreadLimit);
+      FlatAttr = CompileTimeThreadLimit;
+      OutlinedFn->addFnAttr("amdgpu-flat-work-group-size",
+                            AttrVal + "," + AttrVal);
+      setPropertyWorkGroupSize(CGM, OutlinedFn->getName(),
+                               CompileTimeThreadLimit);
+    }
+    FlatAttrEmitted = true;
+  } // end of amdgcn teams or parallel directive
+
+  // Emit amdgpu-flat-work-group-size if it was not emitted above.
+  if (!FlatAttrEmitted) {
+    std::string FlatAttrVal = llvm::utostr(DefaultWorkGroupSz);
+    OutlinedFn->addFnAttr("amdgpu-flat-work-group-size",
+                          FlatAttrVal + "," + FlatAttrVal);
+  }
+  // Emit a kernel descriptor for the runtime.
+  StringRef KernDescName = OutlinedFn->getName();
+  CGOpenMPRuntimeAMDGCN::emitStructureKernelDesc(CGM, KernDescName, FlatAttr,
+                                                 IsGeneric,
+                                                 /*HostServices=*/1);
+}
+
+/// Emit a structure descriptor for a kernel.
+void CGOpenMPRuntimeAMDGCN::emitStructureKernelDesc(
+    CodeGenModule &CGM, StringRef Name, int16_t WG_Size, int8_t Mode,
+    int8_t HostServices) {
+  // Build the descriptor initializer.
+  llvm::Constant *AttrData[] = {
+      llvm::ConstantInt::get(CGM.Int16Ty, 2), // Version
+      llvm::ConstantInt::get(CGM.Int16Ty, 9), // Size in bytes
+      llvm::ConstantInt::get(CGM.Int16Ty, WG_Size),
+      llvm::ConstantInt::get(CGM.Int8Ty, Mode), // 0 => SPMD, 1 => GENERIC
+      llvm::ConstantInt::get(CGM.Int8Ty, HostServices) // 1 => use HostServices
+  };
+
+  llvm::GlobalVariable *AttrImages = CodeGenUtil::createGlobalStruct(
+      CGM, getTgtAttributeStructQTy(), isDefaultLocationConstant(), AttrData,
+      Twine(Name, "_kern_desc"), llvm::GlobalValue::WeakAnyLinkage);
+  CGM.addCompilerUsedGlobal(AttrImages);
+}
+
+// Create the __tgt_attribute_struct type.
+QualType CGOpenMPRuntimeAMDGCN::getTgtAttributeStructQTy() {
+  ASTContext &C = CGM.getContext();
+  QualType KmpInt8Ty = C.getIntTypeForBitwidth(/*DestWidth=*/8, /*Signed=*/1);
+  QualType KmpInt16Ty = C.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/1);
+  if (TgtAttributeStructQTy.isNull()) {
+    RecordDecl *RD = C.buildImplicitRecord("__tgt_attribute_struct");
+    RD->startDefinition();
+    // Version
+    CodeGenUtil::addFieldToRecordDecl(C, RD, KmpInt16Ty);
+    // Struct size in bytes.
+    CodeGenUtil::addFieldToRecordDecl(C, RD, KmpInt16Ty);
+    // WG_Size
+    CodeGenUtil::addFieldToRecordDecl(C, RD, KmpInt16Ty);
+    // Mode
+    CodeGenUtil::addFieldToRecordDecl(C, RD, KmpInt8Ty);
+    // HostServices
+    CodeGenUtil::addFieldToRecordDecl(C, RD, KmpInt8Ty);
+    RD->completeDefinition();
+    TgtAttributeStructQTy = C.getRecordType(RD);
+  }
+  return TgtAttributeStructQTy;
+}
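Read together, emitStructureKernelDesc and getTgtAttributeStructQTy emit one <kernel>_kern_desc global per kernel whose layout mirrors this host-visible view (field meanings taken from the comments above; the C++ rendering is illustrative):

    #include <cstdint>

    // Mirror of the implicit __tgt_attribute_struct record built above.
    struct __tgt_attribute_struct {
      int16_t Version;     // Currently 2.
      int16_t Size;        // Descriptor size in bytes.
      int16_t WG_Size;     // Compile-time work-group size; 0 if unknown.
      int8_t Mode;         // 0 => SPMD, 1 => generic.
      int8_t HostServices; // 1 => kernel uses host services.
    };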
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
@@ -33,6 +33,11 @@
     /// Unknown execution mode (orphaned directive).
     EM_Unknown,
   };
+
+protected:
+  /// Linkage type of the KernelStaticGlobalized variable.
+  llvm::GlobalValue::LinkageTypes KernelStaticGlobalizedLinkage;
+
 private:
   /// Parallel outlined function work for workers to execute.
   llvm::SmallVector<llvm::Function *, 16> Work;
@@ -99,6 +104,7 @@
                                   uint64_t Size, int32_t Flags,
                                   llvm::GlobalValue::LinkageTypes Linkage) override;

+protected:
   /// Emit outlined function specialized for the Fork-Join
   /// programming model for applicable target directives on the NVPTX device.
   /// \param D Directive to emit.
@@ -129,6 +135,7 @@
                          llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
                          const RegionCodeGenTy &CodeGen);

+private:
  /// Emit outlined function for 'target' directive on the NVPTX
   /// device.
   /// \param D Directive to emit.
@@ -211,6 +218,22 @@
   /// Get the maximum number of threads in a block of the GPU.
   virtual llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) = 0;

+  /// Target-independent wrapper over the target-specific emitSPMDKernel().
+  virtual void emitSPMDKernelWrapper(const OMPExecutableDirective &D,
+                                     StringRef ParentName,
+                                     llvm::Function *&OutlinedFn,
+                                     llvm::Constant *&OutlinedFnID,
+                                     bool IsOffloadEntry,
+                                     const RegionCodeGenTy &CodeGen) = 0;
+
+  /// Target-independent wrapper over the target-specific emitNonSPMDKernel().
+  virtual void emitNonSPMDKernelWrapper(const OMPExecutableDirective &D,
+                                        StringRef ParentName,
+                                        llvm::Function *&OutlinedFn,
+                                        llvm::Constant *&OutlinedFnID,
+                                        bool IsOffloadEntry,
+                                        const RegionCodeGenTy &CodeGen) = 0;
+
   /// Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32
   /// global_tid, int proc_bind) to generate code for 'proc_bind' clause.
   virtual void emitProcBindClause(CodeGenFunction &CGF,
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//

 #include "CGOpenMPRuntimeGPU.h"
+#include "CGOpenMPRuntimeAMDGCN.h"
 #include "CGOpenMPRuntimeNVPTX.h"
 #include "CodeGenFunction.h"
 #include "clang/AST/Attr.h"
@@ -1099,10 +1100,10 @@
   // Reserve place for the globalized memory.
   GlobalizedRecords.emplace_back();
   if (!KernelStaticGlobalized) {
+    auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGM.getOpenMPRuntime());
     KernelStaticGlobalized = new llvm::GlobalVariable(
         CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
-        llvm::GlobalValue::InternalLinkage,
-        llvm::UndefValue::get(CGM.VoidPtrTy),
+        RT.KernelStaticGlobalizedLinkage, llvm::UndefValue::get(CGM.VoidPtrTy),
         "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
         llvm::GlobalValue::NotThreadLocal,
         CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
@@ -1231,10 +1232,10 @@
   // Reserve place for the globalized memory.
   GlobalizedRecords.emplace_back();
   if (!KernelStaticGlobalized) {
+    auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGM.getOpenMPRuntime());
     KernelStaticGlobalized = new llvm::GlobalVariable(
         CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
-        llvm::GlobalValue::InternalLinkage,
-        llvm::UndefValue::get(CGM.VoidPtrTy),
+        RT.KernelStaticGlobalizedLinkage, llvm::UndefValue::get(CGM.VoidPtrTy),
         "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
         llvm::GlobalValue::NotThreadLocal,
         CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
@@ -1478,11 +1479,11 @@
   bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
   if (Mode)
-    emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
-                   CodeGen);
+    emitSPMDKernelWrapper(D, ParentName, OutlinedFn, OutlinedFnID,
+                          IsOffloadEntry, CodeGen);
   else
-    emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
-                      CodeGen);
+    emitNonSPMDKernelWrapper(D, ParentName, OutlinedFn, OutlinedFnID,
+                             IsOffloadEntry, CodeGen);

   setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
 }
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
@@ -27,6 +27,7 @@
 public:
   explicit CGOpenMPRuntimeNVPTX(CodeGenModule &CGM);

+private:
   /// Get the GPU warp size.
   llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) override;

@@ -35,6 +36,20 @@
   /// Get the maximum number of threads in a block of the GPU.
   llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) override;
+
+  /// Target-independent wrapper over the target-specific emitSPMDKernel().
+  void emitSPMDKernelWrapper(const OMPExecutableDirective &D,
+                             StringRef ParentName, llvm::Function *&OutlinedFn,
+                             llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
+                             const RegionCodeGenTy &CodeGen) override;
+
+  /// Target-independent wrapper over the target-specific emitNonSPMDKernel().
+  void emitNonSPMDKernelWrapper(const OMPExecutableDirective &D,
+                                StringRef ParentName,
+                                llvm::Function *&OutlinedFn,
+                                llvm::Constant *&OutlinedFnID,
+                                bool IsOffloadEntry,
+                                const RegionCodeGenTy &CodeGen) override;
 };

 } // CodeGen namespace.
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -30,6 +30,7 @@
     : CGOpenMPRuntimeGPU(CGM) {
   if (!CGM.getLangOpts().OpenMPIsDevice)
     llvm_unreachable("OpenMP NVPTX can only handle device code.");
+  KernelStaticGlobalizedLinkage = llvm::GlobalValue::InternalLinkage;
 }

 llvm::Value *CGOpenMPRuntimeNVPTX::getGPUWarpSize(CodeGenFunction &CGF) {
@@ -54,3 +55,19 @@
       &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x);
   return Bld.CreateCall(F, llvm::None, "nvptx_num_threads");
 }
+
+void CGOpenMPRuntimeNVPTX::emitSPMDKernelWrapper(
+    const OMPExecutableDirective &D, StringRef ParentName,
+    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+  emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+                 CodeGen);
+}
+
+void CGOpenMPRuntimeNVPTX::emitNonSPMDKernelWrapper(
+    const OMPExecutableDirective &D, StringRef ParentName,
+    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+  emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+                    CodeGen);
+}
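For context on how these subclasses are reached at all: CodeGenModule selects the concrete OpenMP runtime per target triple. A condensed sketch of that dispatch, simplified from CodeGenModule::createOpenMPRuntime as of this patch (assertions and the simd-only variants omitted):

    // Simplified from CodeGenModule.cpp; not part of this patch.
    void CodeGenModule::createOpenMPRuntime() {
      switch (getTriple().getArch()) {
      case llvm::Triple::nvptx:
      case llvm::Triple::nvptx64:
        OpenMPRuntime.reset(new CGOpenMPRuntimeNVPTX(*this)); // InternalLinkage
        break;
      case llvm::Triple::amdgcn:
        OpenMPRuntime.reset(new CGOpenMPRuntimeAMDGCN(*this)); // WeakAnyLinkage
        break;
      default:
        OpenMPRuntime.reset(new CGOpenMPRuntime(*this));
        break;
      }
    }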