diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -1216,7 +1216,7 @@
     }
 
     CmdArgs.push_back("-include");
-    CmdArgs.push_back("__clang_openmp_math_declares.h");
+    CmdArgs.push_back("__clang_openmp_device_functions.h");
   }
 
   // Add -i* options, and automatically translate to
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -145,8 +145,7 @@
 set(openmp_wrapper_files
   openmp_wrappers/math.h
   openmp_wrappers/cmath
-  openmp_wrappers/__clang_openmp_math.h
-  openmp_wrappers/__clang_openmp_math_declares.h
+  openmp_wrappers/__clang_openmp_device_functions.h
   openmp_wrappers/new
 )
 
diff --git a/clang/lib/Headers/__clang_cuda_cmath.h b/clang/lib/Headers/__clang_cuda_cmath.h
--- a/clang/lib/Headers/__clang_cuda_cmath.h
+++ b/clang/lib/Headers/__clang_cuda_cmath.h
@@ -12,7 +12,9 @@
 #error "This file is for CUDA compilation only."
 #endif
 
+#ifndef _OPENMP
 #include <limits>
+#endif
 
 // CUDA lets us use various std math functions on the device side.  This file
 // works in concert with __clang_cuda_math_forward_declares.h to make this work.
@@ -31,31 +33,15 @@
 // std covers all of the known knowns.
 
 #ifdef _OPENMP
-#define __DEVICE__ static __attribute__((always_inline))
+#define __DEVICE__ static constexpr __attribute__((always_inline, nothrow))
 #else
 #define __DEVICE__ static __device__ __inline__ __attribute__((always_inline))
 #endif
 
-// For C++ 17 we need to include noexcept attribute to be compatible
-// with the header-defined version. This may be removed once
-// variant is supported.
-#if defined(_OPENMP) && defined(__cplusplus) && __cplusplus >= 201703L
-#define __NOEXCEPT noexcept
-#else
-#define __NOEXCEPT
-#endif
-
-#if !(defined(_OPENMP) && defined(__cplusplus))
 __DEVICE__ long long abs(long long __n) { return ::llabs(__n); }
 __DEVICE__ long abs(long __n) { return ::labs(__n); }
 __DEVICE__ float abs(float __x) { return ::fabsf(__x); }
 __DEVICE__ double abs(double __x) { return ::fabs(__x); }
-#endif
-// TODO: remove once variat is supported.
-#if defined(_OPENMP) && defined(__cplusplus)
-__DEVICE__ const float abs(const float __x) { return ::fabsf((float)__x); }
-__DEVICE__ const double abs(const double __x) { return ::fabs((double)__x); }
-#endif
 __DEVICE__ float acos(float __x) { return ::acosf(__x); }
 __DEVICE__ float asin(float __x) { return ::asinf(__x); }
 __DEVICE__ float atan(float __x) { return ::atanf(__x); }
@@ -64,11 +50,9 @@
 __DEVICE__ float cos(float __x) { return ::cosf(__x); }
 __DEVICE__ float cosh(float __x) { return ::coshf(__x); }
 __DEVICE__ float exp(float __x) { return ::expf(__x); }
-__DEVICE__ float fabs(float __x) __NOEXCEPT { return ::fabsf(__x); }
+__DEVICE__ float fabs(float __x) { return ::fabsf(__x); }
 __DEVICE__ float floor(float __x) { return ::floorf(__x); }
 __DEVICE__ float fmod(float __x, float __y) { return ::fmodf(__x, __y); }
-// TODO: remove when variant is supported
-#ifndef _OPENMP
 __DEVICE__ int fpclassify(float __x) {
   return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
                               FP_ZERO, __x);
@@ -77,14 +61,15 @@
   return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
                               FP_ZERO, __x);
 }
-#endif
 __DEVICE__ float frexp(float __arg, int *__exp) {
   return ::frexpf(__arg, __exp);
 }
 
 // For inscrutable reasons, the CUDA headers define these functions for us on
-// Windows.
-#ifndef _MSC_VER
+// Windows. For OpenMP we omit these as some old system headers have
+// non-conforming `isinf(float)` and `isnan(float)` implementations that return
+// an `int`. The system versions of these functions should be fine anyway.
+#if !defined(_MSC_VER) && !defined(_OPENMP)
 __DEVICE__ bool isinf(float __x) { return ::__isinff(__x); }
 __DEVICE__ bool isinf(double __x) { return ::__isinf(__x); }
 __DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); }
@@ -161,6 +146,8 @@
 // libdevice doesn't provide an implementation, and we don't want to be in the
 // business of implementing tricky libm functions in this header.
 
+#ifndef _OPENMP
+
 // Now we've defined everything we promised we'd define in
 // __clang_cuda_math_forward_declares.h.  We need to do two additional things to
 // fix up our math functions.
@@ -457,10 +444,7 @@
 using ::remquof;
 using ::rintf;
 using ::roundf;
-// TODO: remove once variant is supported
-#ifndef _OPENMP
 using ::scalblnf;
-#endif
 using ::scalbnf;
 using ::sinf;
 using ::sinhf;
@@ -479,7 +463,8 @@
 } // namespace std
 #endif
 
-#undef __NOEXCEPT
+#endif // _OPENMP
+
 #undef __DEVICE__
 
 #endif
diff --git a/clang/lib/Headers/__clang_cuda_device_functions.h b/clang/lib/Headers/__clang_cuda_device_functions.h
--- a/clang/lib/Headers/__clang_cuda_device_functions.h
+++ b/clang/lib/Headers/__clang_cuda_device_functions.h
@@ -21,7 +21,7 @@
 // functions and __forceinline__ helps inlining these wrappers at -O1.
 #pragma push_macro("__DEVICE__")
 #ifdef _OPENMP
-#define __DEVICE__ static __attribute__((always_inline))
+#define __DEVICE__ static __attribute__((always_inline, nothrow))
 #else
 #define __DEVICE__ static __device__ __forceinline__
 #endif
diff --git a/clang/lib/Headers/__clang_cuda_math.h b/clang/lib/Headers/__clang_cuda_math.h
--- a/clang/lib/Headers/__clang_cuda_math.h
+++ b/clang/lib/Headers/__clang_cuda_math.h
@@ -23,11 +23,25 @@
 // functions and __forceinline__ helps inlining these wrappers at -O1.
 #pragma push_macro("__DEVICE__")
 #ifdef _OPENMP
-#define __DEVICE__ static __inline__ __attribute__((always_inline))
+#if defined(__cplusplus)
+#define __DEVICE__ static constexpr __attribute__((always_inline, nothrow))
+#else
+#define __DEVICE__ static __attribute__((always_inline, nothrow))
+#endif
 #else
 #define __DEVICE__ static __device__ __forceinline__
 #endif
 
+// Specialized version of __DEVICE__ for functions with void return type. Needed
+// because the OpenMP overlay requires constexpr functions here but prior to
+// c++14 void return functions could not be constexpr.
+#pragma push_macro("__DEVICE_VOID__")
+#ifdef _OPENMP && defined(__cplusplus) && __cplusplus < 201402L
+#define __DEVICE_VOID__ static __attribute__((always_inline, nothrow))
+#else
+#define __DEVICE_VOID__ __DEVICE__
+#endif
+
 // libdevice provides fast low precision and slow full-recision implementations
 // for some functions. Which one gets selected depends on
 // __CLANG_CUDA_APPROX_TRANSCENDENTALS__ which gets defined by clang if
@@ -39,17 +53,8 @@
 #define __FAST_OR_SLOW(fast, slow) slow
 #endif
 
-// For C++ 17 we need to include noexcept attribute to be compatible
-// with the header-defined version. This may be removed once
-// variant is supported.
-#if defined(_OPENMP) && defined(__cplusplus) && __cplusplus >= 201703L
-#define __NOEXCEPT noexcept
-#else
-#define __NOEXCEPT
-#endif
-
-__DEVICE__ int abs(int __a) __NOEXCEPT { return __nv_abs(__a); }
-__DEVICE__ double fabs(double __a) __NOEXCEPT { return __nv_fabs(__a); }
+__DEVICE__ int abs(int __a) { return __nv_abs(__a); }
+__DEVICE__ double fabs(double __a) { return __nv_fabs(__a); }
 __DEVICE__ double acos(double __a) { return __nv_acos(__a); }
 __DEVICE__ float acosf(float __a) { return __nv_acosf(__a); }
 __DEVICE__ double acosh(double __a) { return __nv_acosh(__a); }
@@ -104,7 +109,7 @@
 __DEVICE__ float expf(float __a) { return __nv_expf(__a); }
 __DEVICE__ double expm1(double __a) { return __nv_expm1(__a); }
 __DEVICE__ float expm1f(float __a) { return __nv_expm1f(__a); }
-__DEVICE__ float fabsf(float __a) __NOEXCEPT { return __nv_fabsf(__a); }
+__DEVICE__ float fabsf(float __a) { return __nv_fabsf(__a); }
 __DEVICE__ double fdim(double __a, double __b) { return __nv_fdim(__a, __b); }
 __DEVICE__ float fdimf(float __a, float __b) { return __nv_fdimf(__a, __b); }
 __DEVICE__ double fdivide(double __a, double __b) { return __a / __b; }
@@ -142,15 +147,15 @@
 __DEVICE__ double jn(int __n, double __a) { return __nv_jn(__n, __a); }
 __DEVICE__ float jnf(int __n, float __a) { return __nv_jnf(__n, __a); }
 #if defined(__LP64__) || defined(_WIN64)
-__DEVICE__ long labs(long __a) __NOEXCEPT { return __nv_llabs(__a); };
+__DEVICE__ long labs(long __a) { return __nv_llabs(__a); };
 #else
-__DEVICE__ long labs(long __a) __NOEXCEPT { return __nv_abs(__a); };
+__DEVICE__ long labs(long __a) { return __nv_abs(__a); };
 #endif
 __DEVICE__ double ldexp(double __a, int __b) { return __nv_ldexp(__a, __b); }
 __DEVICE__ float ldexpf(float __a, int __b) { return __nv_ldexpf(__a, __b); }
 __DEVICE__ double lgamma(double __a) { return __nv_lgamma(__a); }
 __DEVICE__ float lgammaf(float __a) { return __nv_lgammaf(__a); }
-__DEVICE__ long long llabs(long long __a) __NOEXCEPT { return __nv_llabs(__a); }
+__DEVICE__ long long llabs(long long __a) { return __nv_llabs(__a); }
 __DEVICE__ long long llmax(long long __a, long long __b) {
   return __nv_llmax(__a, __b);
 }
@@ -270,8 +275,6 @@
 __DEVICE__ float rsqrtf(float __a) { return __nv_rsqrtf(__a); }
 __DEVICE__ double scalbn(double __a, int __b) { return __nv_scalbn(__a, __b); }
 __DEVICE__ float scalbnf(float __a, int __b) { return __nv_scalbnf(__a, __b); }
-// TODO: remove once variant is supported
-#ifndef _OPENMP
 __DEVICE__ double scalbln(double __a, long __b) {
   if (__b > INT_MAX)
     return __a > 0 ? HUGE_VAL : -HUGE_VAL;
@@ -286,18 +289,17 @@
     return __a > 0 ? 0.f : -0.f;
   return scalbnf(__a, (int)__b);
 }
-#endif
 __DEVICE__ double sin(double __a) { return __nv_sin(__a); }
-__DEVICE__ void sincos(double __a, double *__s, double *__c) {
+__DEVICE_VOID__ void sincos(double __a, double *__s, double *__c) {
   return __nv_sincos(__a, __s, __c);
 }
-__DEVICE__ void sincosf(float __a, float *__s, float *__c) {
+__DEVICE_VOID__ void sincosf(float __a, float *__s, float *__c) {
   return __FAST_OR_SLOW(__nv_fast_sincosf, __nv_sincosf)(__a, __s, __c);
 }
-__DEVICE__ void sincospi(double __a, double *__s, double *__c) {
+__DEVICE_VOID__ void sincospi(double __a, double *__s, double *__c) {
   return __nv_sincospi(__a, __s, __c);
 }
-__DEVICE__ void sincospif(float __a, float *__s, float *__c) {
+__DEVICE_VOID__ void sincospif(float __a, float *__s, float *__c) {
   return __nv_sincospif(__a, __s, __c);
 }
 __DEVICE__ float sinf(float __a) {
@@ -339,7 +341,7 @@
 __DEVICE__ float ynf(int __a, float __b) { return __nv_ynf(__a, __b); }
 
 #pragma pop_macro("__DEVICE__")
+#pragma pop_macro("__DEVICE_VOID__")
 #pragma pop_macro("__FAST_OR_SLOW")
-#undef __NOEXCEPT
 
 #endif // __CLANG_CUDA_DEVICE_FUNCTIONS_H__
diff --git a/clang/lib/Headers/__clang_cuda_math_forward_declares.h b/clang/lib/Headers/__clang_cuda_math_forward_declares.h
--- a/clang/lib/Headers/__clang_cuda_math_forward_declares.h
+++ b/clang/lib/Headers/__clang_cuda_math_forward_declares.h
@@ -20,37 +20,14 @@
 // would preclude the use of our own __device__ overloads for these functions.
 
 #pragma push_macro("__DEVICE__")
-#ifdef _OPENMP
-#define __DEVICE__ static __inline__ __attribute__((always_inline))
-#else
 #define __DEVICE__                                                             \
   static __inline__ __attribute__((always_inline)) __attribute__((device))
-#endif
-
-// For C++ 17 we need to include noexcept attribute to be compatible
-// with the header-defined version. This may be removed once
-// variant is supported.
-#if defined(_OPENMP) && defined(__cplusplus) && __cplusplus >= 201703L
-#define __NOEXCEPT noexcept
-#else
-#define __NOEXCEPT
-#endif
 
-#if !(defined(_OPENMP) && defined(__cplusplus))
 __DEVICE__ long abs(long);
 __DEVICE__ long long abs(long long);
 __DEVICE__ double abs(double);
 __DEVICE__ float abs(float);
-#endif
-// While providing the CUDA declarations and definitions for math functions,
-// we may manually define additional functions.
-// TODO: Once variant is supported the additional functions will have
-// to be removed.
-#if defined(_OPENMP) && defined(__cplusplus)
-__DEVICE__ const double abs(const double);
-__DEVICE__ const float abs(const float);
-#endif
-__DEVICE__ int abs(int) __NOEXCEPT;
+__DEVICE__ int abs(int);
 __DEVICE__ double acos(double);
 __DEVICE__ float acos(float);
 __DEVICE__ double acosh(double);
@@ -85,8 +62,8 @@
 __DEVICE__ float exp(float);
 __DEVICE__ double expm1(double);
 __DEVICE__ float expm1(float);
-__DEVICE__ double fabs(double) __NOEXCEPT;
-__DEVICE__ float fabs(float) __NOEXCEPT;
+__DEVICE__ double fabs(double);
+__DEVICE__ float fabs(float);
 __DEVICE__ double fdim(double, double);
 __DEVICE__ float fdim(float, float);
 __DEVICE__ double floor(double);
@@ -136,12 +113,12 @@
 __DEVICE__ bool isnormal(float);
 __DEVICE__ bool isunordered(double, double);
 __DEVICE__ bool isunordered(float, float);
-__DEVICE__ long labs(long) __NOEXCEPT;
+__DEVICE__ long labs(long);
 __DEVICE__ double ldexp(double, int);
 __DEVICE__ float ldexp(float, int);
 __DEVICE__ double lgamma(double);
 __DEVICE__ float lgamma(float);
-__DEVICE__ long long llabs(long long) __NOEXCEPT;
+__DEVICE__ long long llabs(long long);
 __DEVICE__ long long llrint(double);
 __DEVICE__ long long llrint(float);
 __DEVICE__ double log10(double);
@@ -152,9 +129,6 @@
 __DEVICE__ float log2(float);
 __DEVICE__ double logb(double);
 __DEVICE__ float logb(float);
-#if defined(_OPENMP) && defined(__cplusplus)
-__DEVICE__ long double log(long double);
-#endif
 __DEVICE__ double log(double);
 __DEVICE__ float log(float);
 __DEVICE__ long lrint(double);
@@ -302,7 +276,6 @@
 } // namespace std
 #endif
 
-#undef __NOEXCEPT
 #pragma pop_macro("__DEVICE__")
 
 #endif
diff --git a/clang/lib/Headers/openmp_wrappers/__clang_openmp_math_declares.h b/clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h
rename from clang/lib/Headers/openmp_wrappers/__clang_openmp_math_declares.h
rename to clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h
--- a/clang/lib/Headers/openmp_wrappers/__clang_openmp_math_declares.h
+++ b/clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h
@@ -1,4 +1,4 @@
-/*===---- __clang_openmp_math_declares.h - OpenMP math declares ------------===
+/*===- __clang_openmp_device_functions.h - OpenMP device function declares -===
  *
  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  * See https://llvm.org/LICENSE.txt for license information.
@@ -7,28 +7,32 @@
  *===-----------------------------------------------------------------------===
  */
 
-#ifndef __CLANG_OPENMP_MATH_DECLARES_H__
-#define __CLANG_OPENMP_MATH_DECLARES_H__
+#ifndef __CLANG_OPENMP_DEVICE_FUNCTIONS_H__
+#define __CLANG_OPENMP_DEVICE_FUNCTIONS_H__
 
 #ifndef _OPENMP
 #error "This file is for OpenMP compilation only."
 #endif
 
-#if defined(__NVPTX__) && defined(_OPENMP)
+#pragma omp begin declare variant match(                                       \
+    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
 
-#define __CUDA__
-
-#if defined(__cplusplus)
-  #include <__clang_cuda_math_forward_declares.h>
+#ifdef __cplusplus
+extern "C" {
 #endif
 
+#define __CUDA__
 /// Include declarations for libdevice functions.
 #include <__clang_cuda_libdevice_declares.h>
+
 /// Provide definitions for these functions.
 #include <__clang_cuda_device_functions.h>
-#include <__clang_cuda_math.h>
-
 #undef __CUDA__
 
+#ifdef __cplusplus
+} // extern "C"
 #endif
+
+#pragma omp end declare variant
+
 #endif
diff --git a/clang/lib/Headers/openmp_wrappers/__clang_openmp_math.h b/clang/lib/Headers/openmp_wrappers/__clang_openmp_math.h
deleted file mode 100644
--- a/clang/lib/Headers/openmp_wrappers/__clang_openmp_math.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*===---- __clang_openmp_math.h - OpenMP target math support ---------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#if defined(__NVPTX__) && defined(_OPENMP)
-/// TODO:
-/// We are currently reusing the functionality of the Clang-CUDA code path
-/// as an alternative to the host declarations provided by math.h and cmath.
-/// This is suboptimal.
-///
-/// We should instead declare the device functions in a similar way, e.g.,
-/// through OpenMP 5.0 variants, and afterwards populate the module with the
-/// host declarations by unconditionally including the host math.h or cmath,
-/// respectively. This is actually what the Clang-CUDA code path does, using
-/// __device__ instead of variants to avoid redeclarations and get the desired
-/// overload resolution.
-
-#define __CUDA__
-
-#if defined(__cplusplus)
-  #include <__clang_cuda_cmath.h>
-#endif
-
-#undef __CUDA__
-
-/// Magic macro for stopping the math.h/cmath host header from being included.
-#define __CLANG_NO_HOST_MATH__
-
-#endif
-
diff --git a/clang/lib/Headers/openmp_wrappers/cmath b/clang/lib/Headers/openmp_wrappers/cmath
--- a/clang/lib/Headers/openmp_wrappers/cmath
+++ b/clang/lib/Headers/openmp_wrappers/cmath
@@ -1,4 +1,4 @@
-/*===-------------- cmath - Alternative cmath header -----------------------===
+/*===-- __clang_openmp_device_functions.h - OpenMP math declares ------ c++ -===
  *
  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  * See https://llvm.org/LICENSE.txt for license information.
@@ -7,10 +7,67 @@
  *===-----------------------------------------------------------------------===
  */
 
-#include <__clang_openmp_math.h>
+#ifndef __CLANG_OPENMP_CMATH_H__
+#define __CLANG_OPENMP_CMATH_H__
+
+#ifndef _OPENMP
+#error "This file is for OpenMP compilation only."
+#endif
 
-#ifndef __CLANG_NO_HOST_MATH__
 #include_next <cmath>
-#else
-#undef __CLANG_NO_HOST_MATH__
+
+// Make sure we include our math.h overlay, it probably happend already but we
+// need to be sure.
+#include <math.h>
+
+// We (might) need cstdlib because __clang_cuda_cmath.h below declares `abs`
+// which might live in cstdlib.
+#include <cstdlib>
+
+#pragma omp begin declare variant match(                                       \
+    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
+
+#define __CUDA__
+#include <__clang_cuda_cmath.h>
+#undef __CUDA__
+
+// Overloads not provided by the CUDA wrappers but by the CUDA system headers.
+// Since we do not include the latter we define them ourselves.
+#define __DEVICE__ static constexpr __attribute__((always_inline, nothrow))
+
+__DEVICE__ float acosh(float __x) { return ::acoshf(__x); }
+__DEVICE__ float asinh(float __x) { return ::asinhf(__x); }
+__DEVICE__ float atanh(float __x) { return ::atanhf(__x); }
+__DEVICE__ float cbrt(float __x) { return ::cbrtf(__x); }
+__DEVICE__ float erf(float __x) { return ::erff(__x); }
+__DEVICE__ float erfc(float __x) { return ::erfcf(__x); }
+__DEVICE__ float exp2(float __x) { return ::exp2f(__x); }
+__DEVICE__ float expm1(float __x) { return ::expm1f(__x); }
+__DEVICE__ float fdim(float __x, float __y) { return ::fdimf(__x, __y); }
+__DEVICE__ float hypot(float __x, float __y) { return ::hypotf(__x, __y); }
+__DEVICE__ int ilogb(float __x) { return ::ilogbf(__x); }
+__DEVICE__ float lgamma(float __x) { return ::lgammaf(__x); }
+__DEVICE__ long long int llrint(float __x) { return ::llrintf(__x); }
+__DEVICE__ long long int llround(float __x) { return ::llroundf(__x); }
+__DEVICE__ float log1p(float __x) { return ::log1pf(__x); }
+__DEVICE__ float log2(float __x) { return ::log2f(__x); }
+__DEVICE__ float logb(float __x) { return ::logbf(__x); }
+__DEVICE__ long int lrint(float __x) { return ::lrintf(__x); }
+__DEVICE__ long int lround(float __x) { return ::lroundf(__x); }
+__DEVICE__ float nextafter(float __x, float __y) {
+  return ::nextafterf(__x, __y);
+}
+__DEVICE__ float remainder(float __x, float __y) {
+  return ::remainderf(__x, __y);
+}
+__DEVICE__ float scalbln(float __x, long int __y) {
+  return ::scalblnf(__x, __y);
+}
+__DEVICE__ float scalbn(float __x, int __y) { return ::scalbnf(__x, __y); }
+__DEVICE__ float tgamma(float __x) { return ::tgammaf(__x); }
+
+#undef __DEVICE__
+
+#pragma omp end declare variant
+
 #endif
diff --git a/clang/lib/Headers/openmp_wrappers/math.h b/clang/lib/Headers/openmp_wrappers/math.h
--- a/clang/lib/Headers/openmp_wrappers/math.h
+++ b/clang/lib/Headers/openmp_wrappers/math.h
@@ -1,4 +1,4 @@
-/*===------------- math.h - Alternative math.h header ----------------------===
+/*===---- openmp_wrapper/math.h -------- OpenMP math.h intercept ------ c++ -===
  *
  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  * See https://llvm.org/LICENSE.txt for license information.
@@ -7,11 +7,30 @@
  *===-----------------------------------------------------------------------===
  */
 
-#include <__clang_openmp_math.h>
+#ifndef __CLANG_OPENMP_MATH_H__
+#define __CLANG_OPENMP_MATH_H__
 
-#ifndef __CLANG_NO_HOST_MATH__
-#include_next <math.h>
-#else
-#undef __CLANG_NO_HOST_MATH__
+#ifndef _OPENMP
+#error "This file is for OpenMP compilation only."
 #endif
 
+#include_next <math.h>
+
+// We need limits.h for __clang_cuda_math.h below and because it should not hurt
+// we include it eagerly here.
+#include <limits.h>
+
+// We need stdlib.h because (for now) __clang_cuda_math.h below declares `abs`
+// which should live in stdlib.h.
+#include <stdlib.h>
+
+#pragma omp begin declare variant match(                                       \
+    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
+
+#define __CUDA__
+#include <__clang_cuda_math.h>
+#undef __CUDA__
+
+#pragma omp end declare variant
+
+#endif
diff --git a/clang/lib/Headers/openmp_wrappers/time.h b/clang/lib/Headers/openmp_wrappers/time.h
new file mode 100644
--- /dev/null
+++ b/clang/lib/Headers/openmp_wrappers/time.h
@@ -0,0 +1,32 @@
+/*===---- time.h - OpenMP time header wrapper ------------------------ c ---===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_OPENMP_TIME_H__
+#define __CLANG_OPENMP_TIME_H__
+
+#ifndef _OPENMP
+#error "This file is for OpenMP compilation only."
+#endif
+
+#if defined(__cplusplus)
+#define __DEVICE__ static constexpr __attribute__((always_inline, nothrow))
+#else
+#define __DEVICE__ static __attribute__((always_inline, nothrow))
+#endif
+
+#include_next <time.h>
+
+#pragma omp begin declare variant match(                                       \
+    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
+
+__DEVICE__ clock_t clock() { return __nvvm_read_ptx_sreg_clock(); }
+
+#pragma omp end declare variant
+
+#endif
diff --git a/clang/test/Headers/Inputs/include/climits b/clang/test/Headers/Inputs/include/climits
new file mode 100644
--- /dev/null
+++ b/clang/test/Headers/Inputs/include/climits
@@ -0,0 +1,4 @@
+#pragma once
+
+#define INT_MIN  -2147483648
+#define INT_MAX   2147483647
diff --git a/clang/test/Headers/Inputs/include/cmath b/clang/test/Headers/Inputs/include/cmath
--- a/clang/test/Headers/Inputs/include/cmath
+++ b/clang/test/Headers/Inputs/include/cmath
@@ -1,5 +1,227 @@
 #pragma once
 
-double sqrt(double);
+// __clang_cuda_(c)math(.h) also provide `abs` which actually belong in
+// cstdlib. We could split them out but for now we just include cstdlib from
+// cmath.h which is what the systems I've seen do as well.
+#include <cstdlib>
+
+#include <math.h>
+
+double acos(double);
+float acos(float);
+double acosh(double);
+float acosh(float);
+double asin(double);
+float asin(float);
+double asinh(double);
+float asinh(float);
+double atan2(double, double);
+float atan2(float, float);
+double atan(double);
+float atan(float);
+double atanh(double);
+float atanh(float);
+double cbrt(double);
+float cbrt(float);
+double ceil(double);
+float ceil(float);
+double copysign(double, double);
+float copysign(float, float);
+double cos(double);
+float cos(float);
+double cosh(double);
+float cosh(float);
+double erfc(double);
+float erfc(float);
+double erf(double);
+float erf(float);
+double exp2(double);
+float exp2(float);
+double exp(double);
+float exp(float);
+double expm1(double);
+float expm1(float);
+double fdim(double, double);
+float fdim(float, float);
+double floor(double);
+float floor(float);
+double fma(double, double, double);
+float fma(float, float, float);
+double fmax(double, double);
+float fmax(float, float);
+double fmin(double, double);
+float fmin(float, float);
+double fmod(double, double);
+float fmod(float, float);
+int fpclassify(double);
+int fpclassify(float);
+double frexp(double, int *);
+float frexp(float, int *);
+double hypot(double, double);
+float hypot(float, float);
+int ilogb(double);
+int ilogb(float);
+bool isfinite(long double);
+bool isfinite(double);
+bool isfinite(float);
+bool isgreater(double, double);
+bool isgreaterequal(double, double);
+bool isgreaterequal(float, float);
+bool isgreater(float, float);
+bool isinf(long double);
+bool isinf(double);
+bool isinf(float);
+bool isless(double, double);
+bool islessequal(double, double);
+bool islessequal(float, float);
+bool isless(float, float);
+bool islessgreater(double, double);
+bool islessgreater(float, float);
+bool isnan(long double);
+bool isnan(double);
+bool isnan(float);
+bool isnormal(double);
+bool isnormal(float);
+bool isunordered(double, double);
+bool isunordered(float, float);
+double ldexp(double, int);
+float ldexp(float, int);
+double lgamma(double);
+float lgamma(float);
+long long llrint(double);
+long long llrint(float);
+double log10(double);
+float log10(float);
+double log1p(double);
+float log1p(float);
+double log2(double);
+float log2(float);
+double logb(double);
+float logb(float);
+double log(double);
+float log(float);
+long lrint(double);
+long lrint(float);
+long lround(double);
+long lround(float);
+long long llround(float); // No llround(double).
+double modf(double, double *);
+float modf(float, float *);
+double nan(const char *);
+float nanf(const char *);
+double nearbyint(double);
+float nearbyint(float);
+double nextafter(double, double);
+float nextafter(float, float);
 double pow(double, double);
-double modf(double, double*);
+double pow(double, int);
+float pow(float, float);
+float pow(float, int);
+double remainder(double, double);
+float remainder(float, float);
+double remquo(double, double, int *);
+float remquo(float, float, int *);
+double rint(double);
+float rint(float);
+double round(double);
+float round(float);
+double scalbln(double, long);
+float scalbln(float, long);
+double scalbn(double, int);
+float scalbn(float, int);
+bool signbit(double);
+bool signbit(float);
+long double sin(long double);
+double sin(double);
+float sin(float);
+double sinh(double);
+float sinh(float);
+double sqrt(double);
+float sqrt(float);
+double tan(double);
+float tan(float);
+double tanh(double);
+float tanh(float);
+double tgamma(double);
+float tgamma(float);
+double trunc(double);
+float trunc(float);
+
+namespace std {
+
+using ::acos;
+using ::acosh;
+using ::asin;
+using ::asinh;
+using ::atan;
+using ::atan2;
+using ::atanh;
+using ::cbrt;
+using ::ceil;
+using ::copysign;
+using ::cos;
+using ::cosh;
+using ::erf;
+using ::erfc;
+using ::exp;
+using ::exp2;
+using ::expm1;
+using ::fdim;
+using ::floor;
+using ::fma;
+using ::fmax;
+using ::fmin;
+using ::fmod;
+using ::fpclassify;
+using ::frexp;
+using ::hypot;
+using ::ilogb;
+using ::isfinite;
+using ::isgreater;
+using ::isgreaterequal;
+using ::isinf;
+using ::isless;
+using ::islessequal;
+using ::islessgreater;
+using ::isnan;
+using ::isnormal;
+using ::isunordered;
+using ::ldexp;
+using ::lgamma;
+using ::llrint;
+using ::log;
+using ::log10;
+using ::log1p;
+using ::log2;
+using ::logb;
+using ::lrint;
+using ::lround;
+using ::llround;
+using ::modf;
+using ::nan;
+using ::nanf;
+using ::nearbyint;
+using ::nextafter;
+using ::pow;
+using ::remainder;
+using ::remquo;
+using ::rint;
+using ::round;
+using ::scalbln;
+using ::scalbn;
+using ::signbit;
+using ::sin;
+using ::sinh;
+using ::sqrt;
+using ::tan;
+using ::tanh;
+using ::tgamma;
+using ::trunc;
+
+} // namespace std
+
+#define FP_NAN 0
+#define FP_INFINITE 1
+#define FP_ZERO 2
+#define FP_SUBNORMAL 3
+#define FP_NORMAL 4
diff --git a/clang/test/Headers/Inputs/include/cstdlib b/clang/test/Headers/Inputs/include/cstdlib
--- a/clang/test/Headers/Inputs/include/cstdlib
+++ b/clang/test/Headers/Inputs/include/cstdlib
@@ -1,5 +1,7 @@
 #pragma once
 
+#include <stdlib.h>
+
 #if __cplusplus >= 201703L
 extern int abs (int __x) throw()  __attribute__ ((__const__)) ;
 extern long int labs (long int __x) throw() __attribute__ ((__const__)) ;
@@ -20,4 +22,6 @@
 
 inline long long
 abs(long long __x) { return __builtin_llabs (__x); }
+
+float fabs(float __x) { return __builtin_fabs(__x); }
 }
diff --git a/clang/test/Headers/Inputs/include/math.h b/clang/test/Headers/Inputs/include/math.h
--- a/clang/test/Headers/Inputs/include/math.h
+++ b/clang/test/Headers/Inputs/include/math.h
@@ -1,5 +1,199 @@
 #pragma once
 
-double sqrt(double);
-double pow(double, double);
-double modf(double, double*);
+// __clang_cuda_(c)math(.h) also provide `abs` which actually belong in
+// cstdlib. We could split them out but for now we just include cstdlib from
+// cmath.h which is what the systems I've seen do as well.
+#include <stdlib.h>
+
+double fabs(double __a);
+double acos(double __a);
+float acosf(float __a);
+double acosh(double __a);
+float acoshf(float __a);
+double asin(double __a);
+float asinf(float __a);
+double asinh(double __a);
+float asinhf(float __a);
+double atan(double __a);
+double atan2(double __a, double __b);
+float atan2f(float __a, float __b);
+float atanf(float __a);
+double atanh(double __a);
+float atanhf(float __a);
+double cbrt(double __a);
+float cbrtf(float __a);
+double ceil(double __a);
+float ceilf(float __a);
+double copysign(double __a, double __b);
+float copysignf(float __a, float __b);
+double cos(double __a);
+float cosf(float __a);
+double cosh(double __a);
+float coshf(float __a);
+double cospi(double __a);
+float cospif(float __a);
+double cyl_bessel_i0(double __a);
+float cyl_bessel_i0f(float __a);
+double cyl_bessel_i1(double __a);
+float cyl_bessel_i1f(float __a);
+double erf(double __a);
+double erfc(double __a);
+float erfcf(float __a);
+double erfcinv(double __a);
+float erfcinvf(float __a);
+double erfcx(double __a);
+float erfcxf(float __a);
+float erff(float __a);
+double erfinv(double __a);
+float erfinvf(float __a);
+double exp(double __a);
+double exp10(double __a);
+float exp10f(float __a);
+double exp2(double __a);
+float exp2f(float __a);
+float expf(float __a);
+double expm1(double __a);
+float expm1f(float __a);
+float fabsf(float __a);
+double fdim(double __a, double __b);
+float fdimf(float __a, float __b);
+double fdivide(double __a, double __b);
+float fdividef(float __a, float __b);
+double floor(double __f);
+float floorf(float __f);
+double fma(double __a, double __b, double __c);
+float fmaf(float __a, float __b, float __c);
+double fmax(double __a, double __b);
+float fmaxf(float __a, float __b);
+double fmin(double __a, double __b);
+float fminf(float __a, float __b);
+double fmod(double __a, double __b);
+float fmodf(float __a, float __b);
+double frexp(double __a, int *__b);
+float frexpf(float __a, int *__b);
+double hypot(double __a, double __b);
+float hypotf(float __a, float __b);
+int ilogb(double __a);
+int ilogbf(float __a);
+double j0(double __a);
+float j0f(float __a);
+double j1(double __a);
+float j1f(float __a);
+double jn(int __n, double __a);
+float jnf(int __n, float __a);
+double ldexp(double __a, int __b);
+float ldexpf(float __a, int __b);
+double lgamma(double __a);
+float lgammaf(float __a);
+long long llmax(long long __a, long long __b);
+long long llmin(long long __a, long long __b);
+long long llrint(double __a);
+long long llrintf(float __a);
+long long llround(double __a);
+long long llroundf(float __a);
+double log(double __a);
+double log10(double __a);
+float log10f(float __a);
+double log1p(double __a);
+float log1pf(float __a);
+double log2(double __a);
+float log2f(float __a);
+double logb(double __a);
+float logbf(float __a);
+float logf(float __a);
+long lrint(double __a);
+long lrintf(float __a);
+long lround(double __a);
+long lroundf(float __a);
+int max(int __a, int __b);
+int min(int __a, int __b);
+double modf(double __a, double *__b);
+float modff(float __a, float *__b);
+double nearbyint(double __a);
+float nearbyintf(float __a);
+double nextafter(double __a, double __b);
+float nextafterf(float __a, float __b);
+double norm(int __dim, const double *__t);
+double norm3d(double __a, double __b, double __c);
+float norm3df(float __a, float __b, float __c);
+double norm4d(double __a, double __b, double __c, double __d);
+float norm4df(float __a, float __b, float __c, float __d);
+double normcdf(double __a);
+float normcdff(float __a);
+double normcdfinv(double __a);
+float normcdfinvf(float __a);
+float normf(int __dim, const float *__t);
+double pow(double __a, double __b);
+float powf(float __a, float __b);
+double powi(double __a, int __b);
+float powif(float __a, int __b);
+double rcbrt(double __a);
+float rcbrtf(float __a);
+double remainder(double __a, double __b);
+float remainderf(float __a, float __b);
+double remquo(double __a, double __b, int *__c);
+float remquof(float __a, float __b, int *__c);
+double rhypot(double __a, double __b);
+float rhypotf(float __a, float __b);
+double rint(double __a);
+float rintf(float __a);
+double rnorm(int __a, const double *__b);
+double rnorm3d(double __a, double __b, double __c);
+float rnorm3df(float __a, float __b, float __c);
+double rnorm4d(double __a, double __b, double __c, double __d);
+float rnorm4df(float __a, float __b, float __c, float __d);
+float rnormf(int __dim, const float *__t);
+double round(double __a);
+float roundf(float __a);
+double rsqrt(double __a);
+float rsqrtf(float __a);
+double scalbn(double __a, int __b);
+float scalbnf(float __a, int __b);
+double scalbln(double __a, long __b);
+float scalblnf(float __a, long __b);
+double sin(double __a);
+void sincos(double __a, double *__s, double *__c);
+void sincosf(float __a, float *__s, float *__c);
+void sincospi(double __a, double *__s, double *__c);
+void sincospif(float __a, float *__s, float *__c);
+float sinf(float __a);
+double sinh(double __a);
+float sinhf(float __a);
+double sinpi(double __a);
+float sinpif(float __a);
+double sqrt(double __a);
+float sqrtf(float __a);
+double tan(double __a);
+float tanf(float __a);
+double tanh(double __a);
+float tanhf(float __a);
+double tgamma(double __a);
+float tgammaf(float __a);
+double trunc(double __a);
+float truncf(float __a);
+unsigned long long ullmax(unsigned long long __a,
+                          unsigned long long __b);
+unsigned long long ullmin(unsigned long long __a,
+                          unsigned long long __b);
+unsigned int umax(unsigned int __a, unsigned int __b);
+unsigned int umin(unsigned int __a, unsigned int __b);
+double y0(double __a);
+float y0f(float __a);
+double y1(double __a);
+float y1f(float __a);
+double yn(int __a, double __b);
+float ynf(int __a, float __b);
+
+/**
+ * A positive float constant expression. HUGE_VALF evaluates
+ * to +infinity. Used as an error value returned by the built-in
+ * math functions.
+ */
+#define HUGE_VALF (__builtin_huge_valf())
+
+/**
+ * A positive double constant expression. HUGE_VAL evaluates
+ * to +infinity. Used as an error value returned by the built-in
+ * math functions.
+ */
+#define HUGE_VAL (__builtin_huge_val())
diff --git a/clang/test/Headers/Inputs/include/stdlib.h b/clang/test/Headers/Inputs/include/stdlib.h
--- a/clang/test/Headers/Inputs/include/stdlib.h
+++ b/clang/test/Headers/Inputs/include/stdlib.h
@@ -1,2 +1,6 @@
 #pragma once
 typedef __SIZE_TYPE__ size_t;
+
+#ifndef __cplusplus
+extern int abs(int __x) __attribute__((__const__));
+#endif
diff --git a/clang/test/Headers/nvptx_device_cmath_functions.c b/clang/test/Headers/nvptx_device_cmath_functions.c
--- a/clang/test/Headers/nvptx_device_cmath_functions.c
+++ b/clang/test/Headers/nvptx_device_cmath_functions.c
@@ -3,10 +3,11 @@
 
 // REQUIRES: nvptx-registered-target
 
-// RUN: %clang_cc1 -internal-isystem %S/Inputs/include -include cmath -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_math_declares.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -include cmath -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -check-prefix CHECK-YES %s
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -check-prefix CHECK-YES %s
 
-#include <cmath>
+#include <stdlib.h>
+#include <math.h>
 
 void test_sqrt(double a1) {
   #pragma omp target
diff --git a/clang/test/Headers/nvptx_device_cmath_functions.cpp b/clang/test/Headers/nvptx_device_cmath_functions.cpp
--- a/clang/test/Headers/nvptx_device_cmath_functions.cpp
+++ b/clang/test/Headers/nvptx_device_cmath_functions.cpp
@@ -3,8 +3,8 @@
 
 // REQUIRES: nvptx-registered-target
 
-// RUN: %clang_cc1 -internal-isystem %S/Inputs/include -include cmath -x c++ -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_math_declares.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -include cmath -internal-isystem %S/Inputs/include -include stdlib.h -x c++ -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -check-prefix CHECK-YES %s
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include -x c++ -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/Inputs/include -x c++ -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -check-prefix CHECK-YES %s
 
 #include <cmath>
 #include <cstdlib>
diff --git a/clang/test/Headers/nvptx_device_cmath_functions_cxx17.cpp b/clang/test/Headers/nvptx_device_cmath_functions_cxx17.cpp
--- a/clang/test/Headers/nvptx_device_cmath_functions_cxx17.cpp
+++ b/clang/test/Headers/nvptx_device_cmath_functions_cxx17.cpp
@@ -3,8 +3,8 @@
 
 // REQUIRES: nvptx-registered-target
 
-// RUN: %clang_cc1 -internal-isystem %S/Inputs/include -include cmath -x c++ -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -std=c++17
-// RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_math_declares.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -include cmath -internal-isystem %S/Inputs/include -include stdlib.h -x c++ -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -std=c++17 -o - | FileCheck -check-prefix CHECK-YES %s
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include -x c++ -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -std=c++17
+// RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem %S/Inputs/include -x c++ -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -std=c++17 -o - | FileCheck -check-prefix CHECK-YES %s
 
 #include <cmath>
 #include <cstdlib>
diff --git a/clang/test/Headers/nvptx_device_math_complex.c b/clang/test/Headers/nvptx_device_math_complex.c
new file mode 100644
--- /dev/null
+++ b/clang/test/Headers/nvptx_device_math_complex.c
@@ -0,0 +1,23 @@
+// REQUIRES: nvptx-registered-target
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+// expected-no-diagnostics
+
+// CHECK-DAG: call { float, float } @__divsc3(
+// CHECK-DAG: call { float, float } @__mulsc3(
+void test_scmplx(float _Complex a) {
+#pragma omp target
+  {
+    (void)(a * (a / a));
+  }
+}
+
+
+// CHECK-DAG: call { double, double } @__divdc3(
+// CHECK-DAG: call { double, double } @__muldc3(
+void test_dcmplx(double _Complex a) {
+#pragma omp target
+  {
+    (void)(a * (a / a));
+  }
+}
diff --git a/clang/test/Headers/nvptx_device_math_functions.c b/clang/test/Headers/nvptx_device_math_functions.c
--- a/clang/test/Headers/nvptx_device_math_functions.c
+++ b/clang/test/Headers/nvptx_device_math_functions.c
@@ -3,23 +3,31 @@
 
 // REQUIRES: nvptx-registered-target
 
-// RUN: %clang_cc1 -internal-isystem %S/Inputs/include -include math.h -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_math_declares.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -include math.h -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -check-prefix CHECK-YES %s
+// RUN: %clang_cc1 -x c -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -x c -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+// RUN: %clang_cc1 -x c++ -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -x c++ -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
 
+#ifdef __cplusplus
+#include <cstdlib>
+#include <cmath>
+#else
+#include <stdlib.h>
 #include <math.h>
+#endif
 
 void test_sqrt(double a1) {
   #pragma omp target
   {
-    // CHECK-YES: call double @__nv_sqrt(double
+    // CHECK: call double @__nv_sqrt(double
     double l1 = sqrt(a1);
-    // CHECK-YES: call double @__nv_pow(double
+    // CHECK: call double @__nv_pow(double
     double l2 = pow(a1, a1);
-    // CHECK-YES: call double @__nv_modf(double
+    // CHECK: call double @__nv_modf(double
     double l3 = modf(a1 + 3.5, &a1);
-    // CHECK-YES: call double @__nv_fabs(double
+    // CHECK: call double @__nv_fabs(double
     double l4 = fabs(a1);
-    // CHECK-YES: call i32 @__nv_abs(i32
+    // CHECK: call i32 @__nv_abs(i32
     double l5 = abs((int)a1);
   }
 }
diff --git a/clang/test/Headers/nvptx_device_math_functions.cpp b/clang/test/Headers/nvptx_device_math_functions.cpp
--- a/clang/test/Headers/nvptx_device_math_functions.cpp
+++ b/clang/test/Headers/nvptx_device_math_functions.cpp
@@ -3,11 +3,11 @@
 
 // REQUIRES: nvptx-registered-target
 
-// RUN: %clang_cc1 -internal-isystem %S/Inputs/include -include math.h -x c++ -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_math_declares.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -include math.h -internal-isystem %S/Inputs/include -include stdlib.h -include limits -include cstdlib -x c++ -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -check-prefix CHECK-YES %s
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include -x c++ -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem %S/Inputs/include -x c++ -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -check-prefix CHECK-YES %s
 
 #include <cstdlib>
-#include <math.h>
+#include <cmath>
 
 void test_sqrt(double a1) {
   #pragma omp target
diff --git a/clang/test/Headers/nvptx_device_math_functions_cxx17.cpp b/clang/test/Headers/nvptx_device_math_functions_cxx17.cpp
--- a/clang/test/Headers/nvptx_device_math_functions_cxx17.cpp
+++ b/clang/test/Headers/nvptx_device_math_functions_cxx17.cpp
@@ -3,11 +3,11 @@
 
 // REQUIRES: nvptx-registered-target
 
-// RUN: %clang_cc1 -internal-isystem %S/Inputs/include -include math.h -x c++ -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -std=c++17
-// RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_math_declares.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -include math.h -internal-isystem %S/Inputs/include -include stdlib.h -include limits -include cstdlib -x c++ -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -std=c++17 -o - | FileCheck -check-prefix CHECK-YES %s
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include -x c++ -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -std=c++17
+// RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem %S/Inputs/include -x c++ -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -std=c++17 -o - | FileCheck -check-prefix CHECK-YES %s
 
 #include <cstdlib>
-#include <math.h>
+#include <cmath>
 
 void test_sqrt(double a1) {
   #pragma omp target
diff --git a/clang/test/Headers/nvptx_device_math_macro.cpp b/clang/test/Headers/nvptx_device_math_macro.cpp
new file mode 100644
--- /dev/null
+++ b/clang/test/Headers/nvptx_device_math_macro.cpp
@@ -0,0 +1,17 @@
+// REQUIRES: nvptx-registered-target
+// RUN: %clang_cc1 -x c++ -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -x c++ -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+// expected-no-diagnostics
+
+#include <cmath>
+
+#pragma omp declare target
+int use_macro() {
+  double a(0);
+// CHECK-NOT:  call
+// CHECK:  call double @llvm.fabs.f64(double
+// CHECK-NOT:  call
+// CHECK:      ret i32 %conv
+  return (std::fpclassify(a) != FP_ZERO);
+}
+#pragma omp end declare target
diff --git a/clang/test/Headers/nvptx_device_math_modf.cpp b/clang/test/Headers/nvptx_device_math_modf.cpp
new file mode 100644
--- /dev/null
+++ b/clang/test/Headers/nvptx_device_math_modf.cpp
@@ -0,0 +1,53 @@
+// REQUIRES: nvptx-registered-target
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+
+#include <cmath>
+
+// 4 calls to modf(f), all translated to __nv_modf calls:
+
+// CHECK-NOT: _Z.modf
+// CHECK: call double @__nv_modf(double
+// CHECK-NOT: _Z.modf
+// CHECK: call float @__nv_modff(float
+// CHECK-NOT: _Z.modf
+// CHECK: call double @__nv_modf(double
+// CHECK-NOT: _Z.modf
+// CHECK: call float @__nv_modff(float
+// CHECK-NOT: _Z.modf
+
+template<typename T>
+void test_modf(T x)
+{
+  T dx;
+  int intx;
+
+  #pragma omp target map(from: intx, dx)
+  {
+    T ipart;
+    dx = std::modf(x, &ipart);
+    intx = static_cast<int>(ipart);
+  }
+}
+
+int main()
+{
+
+#if !defined(C_ONLY)
+  test_modf<double>(1.0);
+  test_modf<float>(1.0);
+#endif
+
+  #pragma omp target
+  {
+    double intpart, res;
+    res = modf(1.1, &intpart);
+  }
+
+  #pragma omp target
+  {
+    float intpart, res;
+    res = modff(1.1f, &intpart);
+  }
+
+}
diff --git a/clang/test/Headers/nvptx_device_math_sin.c b/clang/test/Headers/nvptx_device_math_sin.c
new file mode 100644
--- /dev/null
+++ b/clang/test/Headers/nvptx_device_math_sin.c
@@ -0,0 +1,27 @@
+// REQUIRES: nvptx-registered-target
+// RUN: %clang_cc1 -x c -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -x c -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=SLOW
+// RUN: %clang_cc1 -x c -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -ffast-math
+// RUN: %clang_cc1 -x c -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -ffast-math | FileCheck %s --check-prefix=FAST
+// expected-no-diagnostics
+
+#include <math.h>
+
+double math(float f, double d, long double ld) {
+  double r = 0;
+// SLOW:  call float @__nv_sinf(float
+// FAST:  call fast float @__nv_fast_sinf(float
+  r += sinf(f);
+// SLOW:  call double @__nv_sin(double
+// FAST:  call fast double @__nv_sin(double
+  r += sin(d);
+  return r;
+}
+
+long double foo(float f, double d, long double ld) {
+  double r = ld;
+  r += math(f, d, ld);
+#pragma omp target map(r)
+  { r += math(f, d, ld); }
+  return r;
+}
diff --git a/clang/test/Headers/nvptx_device_math_sin.cpp b/clang/test/Headers/nvptx_device_math_sin.cpp
new file mode 100644
--- /dev/null
+++ b/clang/test/Headers/nvptx_device_math_sin.cpp
@@ -0,0 +1,27 @@
+// REQUIRES: nvptx-registered-target
+// RUN: %clang_cc1 -x c++ -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -x c++ -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=SLOW
+// RUN: %clang_cc1 -x c++ -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -ffast-math
+// RUN: %clang_cc1 -x c++ -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -ffast-math | FileCheck %s --check-prefix=FAST
+// expected-no-diagnostics
+
+#include <cmath>
+
+double math(float f, double d, long double ld) {
+  double r = 0;
+// SLOW:  call float @__nv_sinf(float
+// FAST:  call fast float @__nv_fast_sinf(float
+  r += sin(f);
+// SLOW:  call double @__nv_sin(double
+// FAST:  call fast double @__nv_sin(double
+  r += sin(d);
+  return r;
+}
+
+long double foo(float f, double d, long double ld) {
+  double r = ld;
+  r += math(f, d, ld);
+#pragma omp target map(r)
+  { r += math(f, d, ld); }
+  return r;
+}
diff --git a/clang/test/Headers/nvptx_device_math_sin_cos.cpp b/clang/test/Headers/nvptx_device_math_sin_cos.cpp
new file mode 100644
--- /dev/null
+++ b/clang/test/Headers/nvptx_device_math_sin_cos.cpp
@@ -0,0 +1,63 @@
+// REQUIRES: nvptx-registered-target
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+
+#include <cmath>
+
+// 6 calls to sin/cos(f), all translated to __nv_sin/__nv_cos calls:
+
+// CHECK-NOT: _Z.sin
+// CHECK-NOT: _Z.cos
+// CHECK: call double @__nv_sin(double
+// CHECK-NOT: _Z.sin
+// CHECK-NOT: _Z.cos
+// CHECK: call float @__nv_sinf(float
+// CHECK-NOT: _Z.sin
+// CHECK-NOT: _Z.cos
+// CHECK: call double @__nv_sin(double
+// CHECK-NOT: _Z.sin
+// CHECK-NOT: _Z.cos
+// CHECK: call double @__nv_cos(double
+// CHECK-NOT: _Z.sin
+// CHECK-NOT: _Z.cos
+// CHECK: call float @__nv_sinf(float
+// CHECK-NOT: _Z.sin
+// CHECK-NOT: _Z.cos
+// CHECK: call float @__nv_cosf(float
+// CHECK-NOT: _Z.sin
+// CHECK-NOT: _Z.cos
+
+template<typename T>
+void test_sin_cos(T x)
+{
+  T res_sin, res_cos;
+
+  #pragma omp target map(from: res_sin, res_cos)
+  {
+    res_sin = std::sin(x);
+    res_cos = std::cos(x);
+  }
+}
+
+int main()
+{
+
+#if !defined(C_ONLY)
+  test_sin_cos<double>(0.0);
+  test_sin_cos<float>(0.0);
+#endif
+
+  #pragma omp target
+  {
+    double res;
+    res = sin(1.0);
+  }
+
+  #pragma omp target
+  {
+    float res;
+    res = sinf(1.0f);
+  }
+
+  return 0;
+}
diff --git a/clang/test/Headers/nvptx_device_math_sincos.cpp b/clang/test/Headers/nvptx_device_math_sincos.cpp
new file mode 100644
--- /dev/null
+++ b/clang/test/Headers/nvptx_device_math_sincos.cpp
@@ -0,0 +1,58 @@
+// REQUIRES: nvptx-registered-target
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+
+#include <cmath>
+
+// 4 calls to sincos(f), all translated to __nv_sincos calls:
+
+// CHECK-NOT: _Z.sincos
+// CHECK: call void @__nv_sincos(double
+// CHECK-NOT: _Z.sincos
+// CHECK: call void @__nv_sincosf(float
+// CHECK-NOT: _Z.sincos
+// CHECK: call void @__nv_sincos(double
+// CHECK-NOT: _Z.sincos
+// CHECK: call void @__nv_sincosf(float
+// CHECK-NOT: _Z.sincos
+
+// single precision wrapper
+inline void sincos(float x, float* __restrict__ sin, float* __restrict__ cos)
+{
+  sincosf(x, sin, cos);
+}
+
+template<typename T>
+void test_sincos(T x)
+{
+  T res_sin, res_cos;
+
+  #pragma omp target map(from: res_sin, res_cos)
+  {
+    sincos(x, &res_sin, &res_cos);
+  }
+
+}
+
+int main(int argc, char **argv)
+{
+
+#if !defined(C_ONLY)
+  test_sincos<double>(0.0);
+  test_sincos<float>(0.0);
+#endif
+
+  #pragma omp target
+  {
+    double s, c;
+    sincos(0, &s, &c);
+  }
+
+  #pragma omp target
+  {
+    float s, c;
+    sincosf(0.f, &s, &c);
+  }
+
+  return 0;
+}