diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h
deleted file mode 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h
+++ /dev/null
@@ -1,40 +0,0 @@
-//===--- CGOpenMPRuntimeAMDGCN.h - Interface to OpenMP AMDGCN Runtimes ---===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This provides a class for OpenMP runtime code generation specialized to
-// AMDGCN targets from generalized CGOpenMPRuntimeGPU class.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEAMDGCN_H
-#define LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEAMDGCN_H
-
-#include "CGOpenMPRuntime.h"
-#include "CGOpenMPRuntimeGPU.h"
-#include "CodeGenFunction.h"
-#include "clang/AST/StmtOpenMP.h"
-
-namespace clang {
-namespace CodeGen {
-
-class CGOpenMPRuntimeAMDGCN final : public CGOpenMPRuntimeGPU {
-
-public:
-  explicit CGOpenMPRuntimeAMDGCN(CodeGenModule &CGM);
-
-  /// Get the GPU warp size.
-  llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) override;
-
-  /// Get the id of the current thread on the GPU.
-  llvm::Value *getGPUThreadID(CodeGenFunction &CGF) override;
-};
-
-} // namespace CodeGen
-} // namespace clang
-
-#endif // LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEAMDGCN_H
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
deleted file mode 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-//===-- CGOpenMPRuntimeAMDGCN.cpp - Interface to OpenMP AMDGCN Runtimes --===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This provides a class for OpenMP runtime code generation specialized to
-// AMDGCN targets from generalized CGOpenMPRuntimeGPU class.
-//
-//===----------------------------------------------------------------------===//
-
-#include "CGOpenMPRuntimeAMDGCN.h"
-#include "CGOpenMPRuntimeGPU.h"
-#include "CodeGenFunction.h"
-#include "clang/AST/Attr.h"
-#include "clang/AST/DeclOpenMP.h"
-#include "clang/AST/StmtOpenMP.h"
-#include "clang/AST/StmtVisitor.h"
-#include "clang/Basic/Cuda.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/Frontend/OpenMP/OMPGridValues.h"
-#include "llvm/IR/IntrinsicsAMDGPU.h"
-
-using namespace clang;
-using namespace CodeGen;
-using namespace llvm::omp;
-
-CGOpenMPRuntimeAMDGCN::CGOpenMPRuntimeAMDGCN(CodeGenModule &CGM)
-    : CGOpenMPRuntimeGPU(CGM) {
-  if (!CGM.getLangOpts().OpenMPIsDevice)
-    llvm_unreachable("OpenMP AMDGCN can only handle device code.");
-}
-
-llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUWarpSize(CodeGenFunction &CGF) {
-  CGBuilderTy &Bld = CGF.Builder;
-  // return constant compile-time target-specific warp size
-  unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
-  return Bld.getInt32(WarpSize);
-}
-
-llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUThreadID(CodeGenFunction &CGF) {
-  CGBuilderTy &Bld = CGF.Builder;
-  llvm::Function *F =
-      CGF.CGM.getIntrinsic(llvm::Intrinsic::amdgcn_workitem_id_x);
-  return Bld.CreateCall(F, llvm::None, "nvptx_tid");
-}
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
@@ -176,10 +176,10 @@
   /// and NVPTX.
 
   /// Get the GPU warp size.
-  virtual llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) = 0;
+  llvm::Value *getGPUWarpSize(CodeGenFunction &CGF);
 
   /// Get the id of the current thread on the GPU.
-  virtual llvm::Value *getGPUThreadID(CodeGenFunction &CGF) = 0;
+  llvm::Value *getGPUThreadID(CodeGenFunction &CGF);
 
   /// Get the maximum number of threads in a block of the GPU.
   llvm::Value *getGPUNumThreads(CodeGenFunction &CGF);
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -12,7 +12,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "CGOpenMPRuntimeGPU.h"
-#include "CGOpenMPRuntimeNVPTX.h"
 #include "CodeGenFunction.h"
 #include "clang/AST/Attr.h"
 #include "clang/AST/DeclOpenMP.h"
@@ -21,7 +20,6 @@
 #include "clang/Basic/Cuda.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
-#include "llvm/IR/IntrinsicsNVPTX.h"
 #include "llvm/Support/MathExtras.h"
 
 using namespace clang;
@@ -1197,7 +1195,7 @@
 CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
     : CGOpenMPRuntime(CGM, "_", "$") {
   if (!CGM.getLangOpts().OpenMPIsDevice)
-    llvm_unreachable("OpenMP NVPTX can only handle device code.");
+    llvm_unreachable("OpenMP can only handle device code.");
 
   llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
   if (CGM.getLangOpts().OpenMPTargetNewRuntime) {
@@ -3960,3 +3958,17 @@
   }
   return Bld.CreateCall(F, llvm::None, "nvptx_num_threads");
 }
+
+llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) {
+  ArrayRef<llvm::Value *> Args{};
+  return CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+                                   CGM.getModule(), OMPRTL___kmpc_get_hardware_thread_id_in_block),
+                               Args);
+}
+
+llvm::Value *CGOpenMPRuntimeGPU::getGPUWarpSize(CodeGenFunction &CGF) {
+  ArrayRef<llvm::Value *> Args{};
+  return CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+                                   CGM.getModule(), OMPRTL___kmpc_get_warp_size),
+                               Args);
+}
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
deleted file mode 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ /dev/null
@@ -1,40 +0,0 @@
-//===----- CGOpenMPRuntimeNVPTX.h - Interface to OpenMP NVPTX Runtimes ----===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This provides a class for OpenMP runtime code generation specialized to NVPTX
-// targets from generalized CGOpenMPRuntimeGPU class.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMENVPTX_H
-#define LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMENVPTX_H
-
-#include "CGOpenMPRuntime.h"
-#include "CGOpenMPRuntimeGPU.h"
-#include "CodeGenFunction.h"
-#include "clang/AST/StmtOpenMP.h"
-
-namespace clang {
-namespace CodeGen {
-
-class CGOpenMPRuntimeNVPTX final : public CGOpenMPRuntimeGPU {
-
-public:
-  explicit CGOpenMPRuntimeNVPTX(CodeGenModule &CGM);
-
-  /// Get the GPU warp size.
-  llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) override;
-
-  /// Get the id of the current thread on the GPU.
-  llvm::Value *getGPUThreadID(CodeGenFunction &CGF) override;
-};
-
-} // CodeGen namespace.
-} // clang namespace.
-
-#endif // LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMENVPTX_H
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
deleted file mode 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-//===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This provides a class for OpenMP runtime code generation specialized to NVPTX
-// targets from generalized CGOpenMPRuntimeGPU class.
-//
-//===----------------------------------------------------------------------===//
-
-#include "CGOpenMPRuntimeNVPTX.h"
-#include "CGOpenMPRuntimeGPU.h"
-#include "CodeGenFunction.h"
-#include "clang/AST/Attr.h"
-#include "clang/AST/DeclOpenMP.h"
-#include "clang/AST/StmtOpenMP.h"
-#include "clang/AST/StmtVisitor.h"
-#include "clang/Basic/Cuda.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/IR/IntrinsicsNVPTX.h"
-
-using namespace clang;
-using namespace CodeGen;
-using namespace llvm::omp;
-
-CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
-    : CGOpenMPRuntimeGPU(CGM) {
-  if (!CGM.getLangOpts().OpenMPIsDevice)
-    llvm_unreachable("OpenMP NVPTX can only handle device code.");
-}
-
-llvm::Value *CGOpenMPRuntimeNVPTX::getGPUWarpSize(CodeGenFunction &CGF) {
-  return CGF.EmitRuntimeCall(
-      llvm::Intrinsic::getDeclaration(
-          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
-      "nvptx_warp_size");
-}
-
-llvm::Value *CGOpenMPRuntimeNVPTX::getGPUThreadID(CodeGenFunction &CGF) {
-  CGBuilderTy &Bld = CGF.Builder;
-  llvm::Function *F;
-  F = llvm::Intrinsic::getDeclaration(
-      &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x);
-  return Bld.CreateCall(F, llvm::None, "nvptx_tid");
-}
diff --git a/clang/lib/CodeGen/CMakeLists.txt b/clang/lib/CodeGen/CMakeLists.txt
--- a/clang/lib/CodeGen/CMakeLists.txt
+++ b/clang/lib/CodeGen/CMakeLists.txt
@@ -59,9 +59,7 @@
   CGObjCRuntime.cpp
   CGOpenCLRuntime.cpp
   CGOpenMPRuntime.cpp
-  CGOpenMPRuntimeAMDGCN.cpp
   CGOpenMPRuntimeGPU.cpp
-  CGOpenMPRuntimeNVPTX.cpp
   CGRecordLayoutBuilder.cpp
   CGStmt.cpp
   CGStmtOpenMP.cpp
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -19,8 +19,7 @@
 #include "CGObjCRuntime.h"
 #include "CGOpenCLRuntime.h"
 #include "CGOpenMPRuntime.h"
-#include "CGOpenMPRuntimeAMDGCN.h"
-#include "CGOpenMPRuntimeNVPTX.h"
+#include "CGOpenMPRuntimeGPU.h"
 #include "CodeGenFunction.h"
 #include "CodeGenPGO.h"
 #include "ConstantEmitter.h"
@@ -244,14 +243,10 @@
   switch (getTriple().getArch()) {
   case llvm::Triple::nvptx:
   case llvm::Triple::nvptx64:
-    assert(getLangOpts().OpenMPIsDevice &&
-           "OpenMP NVPTX is only prepared to deal with device code.");
-    OpenMPRuntime.reset(new CGOpenMPRuntimeNVPTX(*this));
-    break;
   case llvm::Triple::amdgcn:
     assert(getLangOpts().OpenMPIsDevice &&
-           "OpenMP AMDGCN is only prepared to deal with device code.");
-    OpenMPRuntime.reset(new CGOpenMPRuntimeAMDGCN(*this));
+           "OpenMP AMDGPU/NVPTX is only prepared to deal with device code.");
+    OpenMPRuntime.reset(new CGOpenMPRuntimeGPU(*this));
     break;
   default:
     if (LangOpts.OpenMPSimd)
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -455,6 +455,8 @@
 __OMP_RTL(__kmpc_warp_active_thread_mask, false, Int64,)
 __OMP_RTL(__kmpc_syncwarp, false, Void, Int64)
 
+__OMP_RTL(__kmpc_get_warp_size, false, Int32, )
+
 __OMP_RTL(__kmpc_is_generic_main_thread_id, false, Int8, Int32)
 
 __OMP_RTL(__last, false, Void, )
diff --git a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
@@ -277,5 +277,10 @@
   FunctionTracingRAII();
   return impl::getNumHardwareThreadsInBlock();
 }
+
+__attribute__((noinline)) uint32_t __kmpc_get_warp_size() {
+  FunctionTracingRAII();
+  return impl::getWarpSize();
+}
 }
 #pragma omp end declare target
diff --git a/openmp/libomptarget/DeviceRTL/src/Utils.cpp b/openmp/libomptarget/DeviceRTL/src/Utils.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Utils.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Utils.cpp
@@ -24,6 +24,7 @@
 __attribute__((used, weak, optnone)) void keepAlive() {
   __kmpc_get_hardware_thread_id_in_block();
   __kmpc_get_hardware_num_threads_in_block();
+  __kmpc_get_warp_size();
   __kmpc_barrier_simple_spmd(nullptr, 0);
   __kmpc_barrier_simple_generic(nullptr, 0);
 }
diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
@@ -133,8 +133,11 @@
                            __builtin_amdgcn_workgroup_size_x());
 }
 
+uint32_t __kmpc_get_warp_size() {
+  return WARPSIZE;
+}
+
 EXTERN unsigned GetWarpId() { return __kmpc_get_hardware_thread_id_in_block() / WARPSIZE; }
-EXTERN unsigned GetWarpSize() { return WARPSIZE; }
 EXTERN unsigned GetLaneId() {
   return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
 }
diff --git a/openmp/libomptarget/deviceRTLs/common/include/target/shuffle.h b/openmp/libomptarget/deviceRTLs/common/include/target/shuffle.h
--- a/openmp/libomptarget/deviceRTLs/common/include/target/shuffle.h
+++ b/openmp/libomptarget/deviceRTLs/common/include/target/shuffle.h
@@ -35,7 +35,7 @@
 ///{
 extern "C" {
 unsigned GetLaneId();
-unsigned GetWarpSize();
+unsigned __kmpc_get_warp_size();
 void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi);
 uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi);
 }
@@ -60,7 +60,7 @@
 
 inline int32_t __kmpc_impl_shfl_sync(uint64_t Mask, int32_t Var,
                                      int32_t SrcLane) {
-  int Width = GetWarpSize();
+  int Width = __kmpc_get_warp_size();
   int Self = GetLaneId();
   int Index = SrcLane + (Self & ~(Width - 1));
   return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
@@ -90,7 +90,7 @@
 
 inline int32_t __kmpc_impl_shfl_down_sync(uint64_t Mask, int32_t Var,
                                           uint32_t Delta, int32_t Width) {
-  int32_t T = ((GetWarpSize() - Width) << 8) | 0x1f;
+  int32_t T = ((__kmpc_get_warp_size() - Width) << 8) | 0x1f;
   return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
 }
 
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@@ -105,11 +105,14 @@
 EXTERN unsigned GetWarpId() {
   return __kmpc_get_hardware_thread_id_in_block() / WARPSIZE;
 }
-EXTERN unsigned GetWarpSize() { return WARPSIZE; }
 EXTERN unsigned GetLaneId() {
   return __kmpc_get_hardware_thread_id_in_block() & (WARPSIZE - 1);
 }
 
+unsigned __kmpc_get_warp_size() {
+  return WARPSIZE;
+}
+
 // Atomics
 uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) {
   return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);