diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -1199,6 +1199,18 @@ CmdArgs.push_back("-include"); CmdArgs.push_back("__clang_openmp_device_functions.h"); + + { + auto *CTC = static_cast<const toolchains::CudaToolChain *>( + C.getSingleOffloadToolChain<Action::OFK_Cuda>()); + assert(CTC && "Expected valid CUDA Toolchain."); + CudaVersion Ver = CTC->CudaInstallation.version(); + CmdArgs.push_back("-include"); + const char *Header = (Ver >= CudaVersion::CUDA_90) + ? "__clang_openmp_devicertl_cuda_ge90.h" + : "__clang_openmp_devicertl_cuda_lt90.h"; + CmdArgs.push_back(Header); + } } // Add -i* options, and automatically translate to diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -160,6 +160,8 @@ openmp_wrappers/complex.h openmp_wrappers/complex openmp_wrappers/__clang_openmp_device_functions.h + openmp_wrappers/__clang_openmp_devicertl_cuda_lt90.h + openmp_wrappers/__clang_openmp_devicertl_cuda_ge90.h openmp_wrappers/complex_cmath.h openmp_wrappers/new ) diff --git a/clang/lib/Headers/openmp_wrappers/__clang_openmp_devicertl_cuda_ge90.h b/clang/lib/Headers/openmp_wrappers/__clang_openmp_devicertl_cuda_ge90.h new file mode 100644 --- /dev/null +++ b/clang/lib/Headers/openmp_wrappers/__clang_openmp_devicertl_cuda_ge90.h @@ -0,0 +1,53 @@ +//===--- __clang_openmp_devicertl_cuda_ge90.h -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef __CLANG_OPENMP_DEVICERTL_CUDA_GE90_H__ +#define __CLANG_OPENMP_DEVICERTL_CUDA_GE90_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#pragma push_macro("DEVICE") + +#ifdef _OPENMP +#define DEVICE __attribute__((used)) +#else +#define DEVICE __attribute__((used)) __attribute__((device)) +#endif + +// In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask(). +inline DEVICE unsigned __kmpc_impl_activemask() { + unsigned mask; + asm volatile("activemask.b32 %0;" : "=r"(mask)); + return mask; +} + +// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'. +inline DEVICE int __kmpc_impl_shfl_sync(unsigned Mask, int Var, int SrcLane) { + int WARPSIZE = 32; + return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, WARPSIZE - 1); +} + +inline DEVICE int __kmpc_impl_shfl_down_sync(unsigned Mask, int Var, + unsigned Delta, int Width) { + int WARPSIZE = 32; + int tmp = ((WARPSIZE - Width) << 8) | 0x1f; + return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, tmp); +} + +inline DEVICE void __kmpc_impl_syncwarp(unsigned Mask) { + __nvvm_bar_warp_sync(Mask); +} + +#pragma pop_macro("DEVICE") + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/clang/lib/Headers/openmp_wrappers/__clang_openmp_devicertl_cuda_lt90.h b/clang/lib/Headers/openmp_wrappers/__clang_openmp_devicertl_cuda_lt90.h new file mode 100644 --- /dev/null +++ b/clang/lib/Headers/openmp_wrappers/__clang_openmp_devicertl_cuda_lt90.h @@ -0,0 +1,52 @@ +//===--- __clang_openmp_devicertl_cuda_lt90.h -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef __CLANG_OPENMP_DEVICERTL_CUDA_LT90_H__ +#define __CLANG_OPENMP_DEVICERTL_CUDA_LT90_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#pragma push_macro("DEVICE") + +#ifdef _OPENMP +#define DEVICE __attribute__((used)) +#else +#define DEVICE __attribute__((used)) __attribute__((device)) +#endif + +// In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask(). +inline DEVICE unsigned __kmpc_impl_activemask() { + return __nvvm_vote_ballot(1); +} + +// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'. +inline DEVICE int __kmpc_impl_shfl_sync(unsigned Mask, int Var, int SrcLane) { + int WARPSIZE = 32; + return __nvvm_shfl_idx_i32(Var, SrcLane, WARPSIZE - 1); +} + +inline DEVICE int __kmpc_impl_shfl_down_sync(unsigned Mask, int Var, + unsigned Delta, int Width) { + int WARPSIZE = 32; + int tmp = ((WARPSIZE - Width) << 8) | 0x1f; + return __nvvm_shfl_down_i32(Var, Delta, tmp); +} + +inline DEVICE void __kmpc_impl_syncwarp(unsigned Mask) { + (void)Mask; + // In Cuda < 9.0 no need to sync threads in warps. 
+} + +#pragma pop_macro("DEVICE") + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h @@ -85,19 +85,17 @@ INLINE uint64_t __kmpc_impl_ffs(uint64_t x) { return __builtin_ffsl(x); } INLINE uint64_t __kmpc_impl_popc(uint64_t x) { return __builtin_popcountl(x); } -DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask(); +EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask(); -DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t Var, +EXTERN int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t Var, int32_t SrcLane); -DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t Var, +EXTERN int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t Var, uint32_t Delta, int32_t Width); -INLINE void __kmpc_impl_syncthreads() { __builtin_amdgcn_s_barrier(); } +EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t); -INLINE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t) { - // AMDGCN doesn't need to sync threads in a warp -} +INLINE void __kmpc_impl_syncthreads() { __builtin_amdgcn_s_barrier(); } // AMDGCN specific kernel initialization DEVICE void __kmpc_impl_target_init(); diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip @@ -46,11 +46,11 @@ } // Warp vote function -DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() { +EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask() { return __builtin_amdgcn_read_exec(); } -DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t var, +EXTERN int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t var, 
int32_t srcLane) { int width = WARPSIZE; int self = GetLaneId(); @@ -58,7 +58,7 @@ return __builtin_amdgcn_ds_bpermute(index << 2, var); } -DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t var, +EXTERN int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t var, uint32_t laneDelta, int32_t width) { int self = GetLaneId(); int index = self + laneDelta; @@ -66,6 +66,10 @@ return __builtin_amdgcn_ds_bpermute(index << 2, var); } +EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t) { + // AMDGCN doesn't need to sync threads in a warp +} + static DEVICE SHARED uint32_t L1_Barrier; DEVICE void __kmpc_impl_target_init() { diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -100,17 +100,18 @@ #error CUDA_VERSION macro is undefined, something wrong with cuda. #endif -DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask(); +EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask(); -DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var, +EXTERN int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var, int32_t SrcLane); -DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask, +EXTERN int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask, int32_t Var, uint32_t Delta, int32_t Width); +EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask); + DEVICE void __kmpc_impl_syncthreads(); -DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask); // NVPTX specific kernel initialization DEVICE void __kmpc_impl_target_init(); diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu @@ -18,17 
+18,6 @@ // Forward declaration of CUDA primitives which will be evetually transformed // into LLVM intrinsics. -extern "C" { -unsigned int __activemask(); -unsigned int __ballot(unsigned); -// The default argument here is based on NVIDIA's website -// https://developer.nvidia.com/blog/using-cuda-warp-level-primitives/ -int __shfl_sync(unsigned mask, int val, int src_line, int width = WARPSIZE); -int __shfl(int val, int src_line, int width = WARPSIZE); -int __shfl_down(int var, unsigned detla, int width); -int __shfl_down_sync(unsigned mask, int var, unsigned detla, int width); -void __syncwarp(int mask); -} DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) { asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val)); @@ -69,45 +58,8 @@ return (double)nsecs * __kmpc_impl_get_wtick(); } -// In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask(). -DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() { -#if CUDA_VERSION >= 9000 - return __activemask(); -#else - return __ballot(1); -#endif -} - -// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'. -DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var, - int32_t SrcLane) { -#if CUDA_VERSION >= 9000 - return __shfl_sync(Mask, Var, SrcLane); -#else - return __shfl(Var, SrcLane); -#endif // CUDA_VERSION -} - -DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask, - int32_t Var, uint32_t Delta, - int32_t Width) { -#if CUDA_VERSION >= 9000 - return __shfl_down_sync(Mask, Var, Delta, Width); -#else - return __shfl_down(Var, Delta, Width); -#endif // CUDA_VERSION -} - DEVICE void __kmpc_impl_syncthreads() { __syncthreads(); } -DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) { -#if CUDA_VERSION >= 9000 - __syncwarp(Mask); -#else - // In Cuda < 9.0 no need to sync threads in warps. 
-#endif // CUDA_VERSION -} - // NVPTX specific kernel initialization DEVICE void __kmpc_impl_target_init() { /* nvptx needs no extra setup */ }