Index: clang/lib/Driver/ToolChains/Clang.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Clang.cpp
+++ clang/lib/Driver/ToolChains/Clang.cpp
@@ -1204,7 +1204,8 @@
   // If we are offloading to a target via OpenMP we need to include the
   // openmp_wrappers folder which contains alternative system headers.
   if (JA.isDeviceOffloading(Action::OFK_OpenMP) &&
-      getToolChain().getTriple().isNVPTX()){
+      (getToolChain().getTriple().isNVPTX() ||
+       getToolChain().getTriple().isAMDGCN())) {
     if (!Args.hasArg(options::OPT_nobuiltininc)) {
       // Add openmp_wrappers/* to our system include path.  This lets us wrap
       // standard library headers.
Index: clang/lib/Headers/CMakeLists.txt
===================================================================
--- clang/lib/Headers/CMakeLists.txt
+++ clang/lib/Headers/CMakeLists.txt
@@ -160,6 +160,7 @@
   openmp_wrappers/__clang_openmp_device_functions.h
   openmp_wrappers/complex_cmath.h
   openmp_wrappers/new
+  openmp_wrappers/hip/hip_runtime.h
 )
 
 set(output_dir ${LLVM_LIBRARY_OUTPUT_INTDIR}/clang/${CLANG_VERSION}/include)
Index: clang/lib/Headers/__clang_cuda_complex_builtins.h
===================================================================
--- clang/lib/Headers/__clang_cuda_complex_builtins.h
+++ clang/lib/Headers/__clang_cuda_complex_builtins.h
@@ -27,6 +27,7 @@
 // different but equivalent function versions. TODO: For OpenMP we currently
 // select the native builtins as the overload support for templates is lacking.
 #if !defined(_OPENMP)
+#ifdef __NVPTX__
 #define _ISNANd std::isnan
 #define _ISNANf std::isnan
 #define _ISINFd std::isinf
@@ -41,7 +42,29 @@
 #define _ABSf std::abs
 #define _LOGBd std::logb
 #define _LOGBf std::logb
+#define _fmaxd max
+#define _fmaxf max
 #else
+#include <__clang_hip_libdevice_declares.h>
+#define _ISNANd __ocml_isnan_f64
+#define _ISNANf __ocml_isnan_f32
+#define _ISINFd __ocml_isinf_f64
+#define _ISINFf __ocml_isinf_f32
+#define _ISFINITEd __ocml_isfinite_f64
+#define _ISFINITEf __ocml_isfinite_f32
+#define _COPYSIGNd __ocml_copysign_f64
+#define _COPYSIGNf __ocml_copysign_f32
+#define _SCALBNd __ocml_scalbn_f64
+#define _SCALBNf __ocml_scalbn_f32
+#define _ABSd __ocml_fabs_f64
+#define _ABSf __ocml_fabs_f32
+#define _LOGBd __ocml_logb_f64
+#define _LOGBf __ocml_logb_f32
+#define _fmaxd __ocml_fmax_f64
+#define _fmaxf __ocml_fmax_f32
+#endif
+#else  // OPENMP
+#ifdef __NVPTX__
 #define _ISNANd __nv_isnand
 #define _ISNANf __nv_isnanf
 #define _ISINFd __nv_isinfd
@@ -56,6 +79,28 @@
 #define _ABSf __nv_fabsf
 #define _LOGBd __nv_logb
 #define _LOGBf __nv_logbf
+#define _fmaxd __nv_fmax
+#define _fmaxf __nv_fmaxf
+#else
+// OPENMP and __AMDGCN__
+#include <__clang_hip_libdevice_declares.h>
+#define _ISNANd __ocml_isnan_f64
+#define _ISNANf __ocml_isnan_f32
+#define _ISINFd __ocml_isinf_f64
+#define _ISINFf __ocml_isinf_f32
+#define _ISFINITEd __ocml_isfinite_f64
+#define _ISFINITEf __ocml_isfinite_f32
+#define _COPYSIGNd __ocml_copysign_f64
+#define _COPYSIGNf __ocml_copysign_f32
+#define _SCALBNd __ocml_scalbn_f64
+#define _SCALBNf __ocml_scalbn_f32
+#define _ABSd __ocml_fabs_f64
+#define _ABSf __ocml_fabs_f32
+#define _LOGBd __ocml_logb_f64
+#define _LOGBf __ocml_logb_f32
+#define _fmaxd __ocml_fmax_f64
+#define _fmaxf __ocml_fmax_f32
+#endif
 #endif
 
 #if defined(__cplusplus)
@@ -167,7 +212,7 @@
   // Can't use std::max, because that's defined in <algorithm>, and we don't
   // want to pull that in for every compile.  The CUDA headers define
   // ::max(float, float) and ::max(double, double), which is sufficient for us.
-  double __logbw = _LOGBd(max(_ABSd(__c), _ABSd(__d)));
+  double __logbw = _LOGBd(_fmaxd(_ABSd(__c), _ABSd(__d)));
   if (_ISFINITEd(__logbw)) {
     __ilogbw = (int)__logbw;
     __c = _SCALBNd(__c, -__ilogbw);
@@ -200,7 +245,7 @@
 
 __DEVICE__ float _Complex __divsc3(float __a, float __b, float __c, float __d) {
   int __ilogbw = 0;
-  float __logbw = _LOGBf(max(_ABSf(__c), _ABSf(__d)));
+  float __logbw = _LOGBf(_fmaxf(_ABSf(__c), _ABSf(__d)));
   if (_ISFINITEf(__logbw)) {
     __ilogbw = (int)__logbw;
     __c = _SCALBNf(__c, -__ilogbw);
@@ -249,6 +294,8 @@
 #undef _ABSf
 #undef _LOGBd
 #undef _LOGBf
+#undef _fmaxd
+#undef _fmaxf
 
 #ifdef _OPENMP
 #pragma omp end declare target
Index: clang/lib/Headers/__clang_hip_cmath.h
===================================================================
--- clang/lib/Headers/__clang_hip_cmath.h
+++ clang/lib/Headers/__clang_hip_cmath.h
@@ -20,8 +20,15 @@
 #include <limits.h>
 #include <stdint.h>
 
+// __DEVICE__ is a helper macro with common set of attributes for the wrappers
+// we implement in this file. We need static in order to avoid emitting unused
+// functions.
 #pragma push_macro("__DEVICE__")
+#ifdef __OPENMP_AMDGCN__
+#define __DEVICE__ static constexpr __attribute__((always_inline, nothrow))
+#else
 #define __DEVICE__ static __device__ inline __attribute__((always_inline))
+#endif
 
 // Start with functions that cannot be defined by DEF macros below.
 #if defined(__cplusplus)
Index: clang/lib/Headers/__clang_hip_libdevice_declares.h
===================================================================
--- clang/lib/Headers/__clang_hip_libdevice_declares.h
+++ clang/lib/Headers/__clang_hip_libdevice_declares.h
@@ -299,8 +299,13 @@
 typedef _Float16 __2f16 __attribute__((ext_vector_type(2)));
 typedef short __2i16 __attribute__((ext_vector_type(2)));
 
+#if defined(__cplusplus)
 __device__ __attribute__((const)) float __ockl_fdot2(__2f16 a, __2f16 b,
                                                      float c, bool s);
+#else
+__device__ __attribute__((const)) float __ockl_fdot2(__2f16 a, __2f16 b,
+                                                     float c, unsigned int s);
+#endif
 __device__ __attribute__((const)) __2f16 __ocml_ceil_2f16(__2f16);
 __device__ __attribute__((const)) __2f16 __ocml_fabs_2f16(__2f16);
 __device__ __2f16 __ocml_cos_2f16(__2f16);
Index: clang/lib/Headers/__clang_hip_math.h
===================================================================
--- clang/lib/Headers/__clang_hip_math.h
+++ clang/lib/Headers/__clang_hip_math.h
@@ -19,16 +19,36 @@
 #include <limits.h>
 #include <stdint.h>
 
+// __DEVICE__ is a helper macro with common set of attributes for the wrappers
+// we implement in this file. We need static in order to avoid emitting unused
+// functions and __forceinline__ helps inlining these wrappers at -O1.
 #pragma push_macro("__DEVICE__")
+#pragma push_macro("__DEVICE_NOCE__")
+
+#ifdef __OPENMP_AMDGCN__
+#if defined(__cplusplus)
+#define __DEVICE__ static constexpr __attribute__((always_inline, nothrow))
+#define __DEVICE_NOCE__ static __attribute__((always_inline, nothrow))
+#else // !defined(__cplusplus), c openmp compilation
+#define __DEVICE__ static __attribute__((always_inline, nothrow))
+#define __DEVICE_NOCE__ __DEVICE__
+#endif
+#else // !__OPENMP_AMDGCN__, so this is for HIP-Clang which is always C++.
 #define __DEVICE__ static __device__ inline __attribute__((always_inline))
+#define __DEVICE_NOCE__ __DEVICE__
+#endif
 
 // A few functions return bool type starting only in C++11.
 #pragma push_macro("__RETURN_TYPE")
+#ifdef __OPENMP_AMDGCN__
+#define __RETURN_TYPE int
+#else
 #if defined(__cplusplus)
 #define __RETURN_TYPE bool
 #else
 #define __RETURN_TYPE int
 #endif
+#endif
 
 #if defined (__cplusplus) && __cplusplus < 201103L
 // emulate static_assert on type sizes
@@ -39,7 +59,12 @@
   static const bool valid;
 };
 
-__DEVICE__
+// All following c-capable function defs have one of two macro modifiers:
+// __DEVICE__
+// __DEVICE_NOCE__ same as __DEVICE__ but no constexpr for those functions
+//                 that cannot return constexpr in c++.
+
+__DEVICE_NOCE__
 void __suppress_unused_warning(bool b){};
 template <unsigned int S, unsigned int T>
 __DEVICE__ void __static_assert_equal_size() {
@@ -55,7 +80,7 @@
 
 #endif
 
-__DEVICE__
+__DEVICE_NOCE__
 uint64_t __make_mantissa_base8(const char *__tagp) {
   uint64_t __r = 0;
   while (__tagp) {
@@ -72,7 +97,7 @@
   return __r;
 }
 
-__DEVICE__
+__DEVICE_NOCE__
 uint64_t __make_mantissa_base10(const char *__tagp) {
   uint64_t __r = 0;
   while (__tagp) {
@@ -89,7 +114,7 @@
   return __r;
 }
 
-__DEVICE__
+__DEVICE_NOCE__
 uint64_t __make_mantissa_base16(const char *__tagp) {
   uint64_t __r = 0;
   while (__tagp) {
@@ -244,13 +269,17 @@
 __DEVICE__
 float fmodf(float __x, float __y) { return __ocml_fmod_f32(__x, __y); }
 
-__DEVICE__
+__DEVICE_NOCE__
 float frexpf(float __x, int *__nptr) {
+#ifdef __OPENMP_AMDGCN__
+  static __attribute__((address_space(5))) int __tmp;
+  float __r = __ocml_frexp_f32(__x, &__tmp);
+#else
   int __tmp;
   float __r =
       __ocml_frexp_f32(__x, (__attribute__((address_space(5))) int *)&__tmp);
+#endif
   *__nptr = __tmp;
-
   return __r;
 }
 
@@ -275,7 +304,7 @@
 __DEVICE__
 float j1f(float __x) { return __ocml_j1_f32(__x); }
 
-__DEVICE__
+__DEVICE_NOCE__
 float jnf(int __n, float __x) { // TODO: we could use Ahmes multiplication
                                 // and the Miller & Brown algorithm
   //       for linear recurrences to get O(log n) steps, but it's unclear if
@@ -329,15 +358,22 @@
 __DEVICE__
 long int lroundf(float __x) { return __ocml_round_f32(__x); }
 
-__DEVICE__
+__DEVICE_NOCE__
 float modff(float __x, float *__iptr) {
+#ifdef __OPENMP_AMDGCN__
+  static __attribute__((address_space(5))) float __tmp;
+  float __r = __ocml_modf_f32(__x, &__tmp);
+#else
   float __tmp;
   float __r =
       __ocml_modf_f32(__x, (__attribute__((address_space(5))) float *)&__tmp);
+#endif
   *__iptr = __tmp;
   return __r;
 }
 
+// FIXME need a c version of nanf
+#if defined(__cplusplus)
 __DEVICE__
 float nanf(const char *__tagp) {
   union {
@@ -358,6 +394,7 @@
 
   return __tmp.val;
 }
+#endif
 
 __DEVICE__
 float nearbyintf(float __x) { return __ocml_nearbyint_f32(__x); }
@@ -383,7 +420,7 @@
 __DEVICE__
 float normcdfinvf(float __x) { return __ocml_ncdfinv_f32(__x); }
 
-__DEVICE__
+__DEVICE_NOCE__
 float normf(int __dim,
             const float *__a) { // TODO: placeholder until OCML adds support.
   float __r = 0;
@@ -409,13 +446,17 @@
   return __ocml_remainder_f32(__x, __y);
 }
 
-__DEVICE__
+__DEVICE_NOCE__
 float remquof(float __x, float __y, int *__quo) {
+#ifdef __OPENMP_AMDGCN__
+  static __attribute__((address_space(5))) int __tmp;
+  float __r = __ocml_remquo_f32( __x, __y, &__tmp);
+#else
   int __tmp;
   float __r = __ocml_remquo_f32(
       __x, __y, (__attribute__((address_space(5))) int *)&__tmp);
+#endif
   *__quo = __tmp;
-
   return __r;
 }
 
@@ -435,7 +476,7 @@
   return __ocml_rlen4_f32(__x, __y, __z, __w);
 }
 
-__DEVICE__
+__DEVICE_NOCE__
 float rnormf(int __dim,
              const float *__a) { // TODO: placeholder until OCML adds support.
   float __r = 0;
@@ -465,19 +506,29 @@
 __DEVICE__
 __RETURN_TYPE __signbitf(float __x) { return __ocml_signbit_f32(__x); }
 
-__DEVICE__
+__DEVICE_NOCE__
 void sincosf(float __x, float *__sinptr, float *__cosptr) {
+#ifdef __OPENMP_AMDGCN__
+  static __attribute__((address_space(5))) float __tmp;
+  *__sinptr = __ocml_sincos_f32(__x, &__tmp);
+#else
   float __tmp;
   *__sinptr =
       __ocml_sincos_f32(__x, (__attribute__((address_space(5))) float *)&__tmp);
+#endif
   *__cosptr = __tmp;
 }
 
-__DEVICE__
+__DEVICE_NOCE__
 void sincospif(float __x, float *__sinptr, float *__cosptr) {
+#ifdef __OPENMP_AMDGCN__
+  static __attribute__((address_space(5))) float __tmp;
+  *__sinptr = __ocml_sincospi_f32(__x, &__tmp);
+#else
   float __tmp;
   *__sinptr = __ocml_sincospi_f32(
       __x, (__attribute__((address_space(5))) float *)&__tmp);
+#endif
   *__cosptr = __tmp;
 }
 
@@ -511,7 +562,7 @@
 __DEVICE__
 float y1f(float __x) { return __ocml_y1_f32(__x); }
 
-__DEVICE__
+__DEVICE_NOCE__
 float ynf(int __n, float __x) { // TODO: we could use Ahmes multiplication
                                 // and the Miller & Brown algorithm
   //       for linear recurrences to get O(log n) steps, but it's unclear if
@@ -658,7 +709,7 @@
 __DEVICE__
 float __saturatef(float __x) { return (__x < 0) ? 0 : ((__x > 1) ? 1 : __x); }
 
-__DEVICE__
+__DEVICE_NOCE__
 void __sincosf(float __x, float *__sinptr, float *__cosptr) {
   *__sinptr = __ocml_native_sin_f32(__x);
   *__cosptr = __ocml_native_cos_f32(__x);
@@ -770,11 +821,16 @@
 __DEVICE__
 double fmod(double __x, double __y) { return __ocml_fmod_f64(__x, __y); }
 
-__DEVICE__
+__DEVICE_NOCE__
 double frexp(double __x, int *__nptr) {
+#ifdef __OPENMP_AMDGCN__
+  static __attribute__((address_space(5))) int __tmp;
+  double __r = __ocml_frexp_f64(__x, &__tmp);
+#else
   int __tmp;
   double __r =
       __ocml_frexp_f64(__x, (__attribute__((address_space(5))) int *)&__tmp);
+#endif
   *__nptr = __tmp;
   return __r;
 }
@@ -800,7 +856,7 @@
 __DEVICE__
 double j1(double __x) { return __ocml_j1_f64(__x); }
 
-__DEVICE__
+__DEVICE_NOCE__
 double jn(int __n, double __x) { // TODO: we could use Ahmes multiplication
                                  // and the Miller & Brown algorithm
   //       for linear recurrences to get O(log n) steps, but it's unclear if
@@ -854,16 +910,22 @@
 __DEVICE__
 long int lround(double __x) { return __ocml_round_f64(__x); }
 
-__DEVICE__
+__DEVICE_NOCE__
 double modf(double __x, double *__iptr) {
+#ifdef __OPENMP_AMDGCN__
+  static __attribute__((address_space(5))) double __tmp;
+  double __r = __ocml_modf_f64(__x, &__tmp);
+#else
   double __tmp;
   double __r =
       __ocml_modf_f64(__x, (__attribute__((address_space(5))) double *)&__tmp);
+#endif
   *__iptr = __tmp;
-
   return __r;
 }
 
+// FIXME need a c version of nan
+#if defined(__cplusplus)
 __DEVICE__
 double nan(const char *__tagp) {
 #if !_WIN32
@@ -891,6 +953,7 @@
   return *reinterpret_cast<double *>(&__val);
 #endif
 }
+#endif
 
 __DEVICE__
 double nearbyint(double __x) { return __ocml_nearbyint_f64(__x); }
@@ -900,7 +963,7 @@
   return __ocml_nextafter_f64(__x, __y);
 }
 
-__DEVICE__
+__DEVICE_NOCE__
 double norm(int __dim,
             const double *__a) { // TODO: placeholder until OCML adds support.
   double __r = 0;
@@ -942,13 +1005,17 @@
   return __ocml_remainder_f64(__x, __y);
 }
 
-__DEVICE__
+__DEVICE_NOCE__
 double remquo(double __x, double __y, int *__quo) {
+#ifdef __OPENMP_AMDGCN__
+  static __attribute__((address_space(5))) int __tmp;
+  double __r = __ocml_remquo_f64(__x, __y, &__tmp);
+#else
   int __tmp;
   double __r = __ocml_remquo_f64(
       __x, __y, (__attribute__((address_space(5))) int *)&__tmp);
+#endif
   *__quo = __tmp;
-
   return __r;
 }
 
@@ -958,7 +1025,7 @@
 __DEVICE__
 double rint(double __x) { return __ocml_rint_f64(__x); }
 
-__DEVICE__
+__DEVICE_NOCE__
 double rnorm(int __dim,
              const double *__a) { // TODO: placeholder until OCML adds support.
   double __r = 0;
@@ -1000,19 +1067,29 @@
 __DEVICE__
 double sin(double __x) { return __ocml_sin_f64(__x); }
 
-__DEVICE__
+__DEVICE_NOCE__
 void sincos(double __x, double *__sinptr, double *__cosptr) {
+#ifdef __OPENMP_AMDGCN__
+  static __attribute__((address_space(5))) double __tmp;
+  *__sinptr = __ocml_sincos_f64(__x, &__tmp);
+#else
   double __tmp;
   *__sinptr = __ocml_sincos_f64(
       __x, (__attribute__((address_space(5))) double *)&__tmp);
+#endif
   *__cosptr = __tmp;
 }
 
-__DEVICE__
+__DEVICE_NOCE__
 void sincospi(double __x, double *__sinptr, double *__cosptr) {
+#ifdef __OPENMP_AMDGCN__
+  static __attribute__((address_space(5))) double __tmp;
+  *__sinptr = __ocml_sincospi_f64(__x, &__tmp);
+#else
   double __tmp;
   *__sinptr = __ocml_sincospi_f64(
       __x, (__attribute__((address_space(5))) double *)&__tmp);
+#endif
   *__cosptr = __tmp;
 }
 
@@ -1043,7 +1120,7 @@
 __DEVICE__
 double y1(double __x) { return __ocml_y1_f64(__x); }
 
-__DEVICE__
+__DEVICE_NOCE__
 double yn(int __n, double __x) { // TODO: we could use Ahmes multiplication
                                  // and the Miller & Brown algorithm
   //       for linear recurrences to get O(log n) steps, but it's unclear if
@@ -1231,6 +1308,7 @@
 __DEVICE__
 double min(double __x, double __y) { return fmin(__x, __y); }
 
+#ifndef __OPENMP_AMDGCN__
 __host__ inline static int min(int __arg1, int __arg2) {
   return std::min(__arg1, __arg2);
 }
@@ -1239,7 +1317,9 @@
   return std::max(__arg1, __arg2);
 }
 #endif
+#endif // defined(__cplusplus)
 
+#pragma pop_macro("__DEVICE_NOCE__")
 #pragma pop_macro("__DEVICE__")
 #pragma pop_macro("__RETURN_TYPE")
 
Index: clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h
===================================================================
--- clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h
+++ clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h
@@ -14,11 +14,15 @@
 #error "This file is for OpenMP compilation only."
 #endif
 
+#ifdef __NVPTX__
 #pragma omp begin declare variant match(                                       \
     device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
 
 #ifdef __cplusplus
+#include <cstdint>
 extern "C" {
+#else
+#include <stdint.h>
 #endif
 
 #define __CUDA__
@@ -39,4 +43,41 @@
 
 #pragma omp end declare variant
 
+#endif // __NVPTX__
+
+#ifdef __AMDGCN__
+
+// __NO_INLINE__ prevents some x86 optimized macro definitions in system headers
+#define __NO_INLINE__ 1
+#pragma omp begin declare variant match(                                       \
+    device = {arch(amdgcn)}, implementation = {extension(match_any)})
+
+#ifdef __cplusplus
+#include <cstdint>
+extern "C" {
+#else
+#include <stdint.h>
+#endif
+
+#define __HIP__
+#define __OPENMP_AMDGCN__
+
+#define __host__ __attribute__((host))
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __shared__ __attribute__((shared))
+#define __constant__ __attribute__((constant))
+#define __private __attribute__((address_space(5)))
+
+/// Include declarations for libdevice functions.
+#include <__clang_hip_libdevice_declares.h>
+
+#ifdef __cplusplus
+} // extern "C"
 #endif
+
+#pragma omp end declare variant
+
+#endif // __AMDGCN__
+
+#endif // __CLANG_OPENMP_DEVICE_FUNCTIONS_H__
Index: clang/lib/Headers/openmp_wrappers/cmath
===================================================================
--- clang/lib/Headers/openmp_wrappers/cmath
+++ clang/lib/Headers/openmp_wrappers/cmath
@@ -16,14 +16,16 @@
 
 #include_next <cmath>
 
-// Make sure we include our math.h overlay, it probably happend already but we
-// need to be sure.
+// Make sure we include our new and math.h overlays, it probably happened already
+// but we need to be sure.
+#include <new>
 #include <math.h>
 
 // We (might) need cstdlib because __clang_cuda_cmath.h below declares `abs`
 // which might live in cstdlib.
 #include <cstdlib>
 
+#ifdef __NVPTX__
 // We need limits because __clang_cuda_cmath.h below uses `std::numeric_limit`.
 #include <limits>
 
@@ -74,5 +76,17 @@
 #undef __DEVICE__
 
 #pragma omp end declare variant
+#endif // __NVPTX__
 
-#endif
+#ifdef __AMDGCN__
+#pragma omp begin declare variant match(                                       \
+    device = {arch(amdgcn)}, implementation = {extension(match_any, allow_templates)})
+#define __HIP__
+#define __OPENMP_AMDGCN__
+#include <__clang_hip_cmath.h>
+#undef __HIP__
+
+#pragma omp end declare variant
+#endif // __AMDGCN__
+
+#endif // __CLANG_OPENMP_CMATH_H__
Index: clang/lib/Headers/openmp_wrappers/complex
===================================================================
--- clang/lib/Headers/openmp_wrappers/complex
+++ clang/lib/Headers/openmp_wrappers/complex
@@ -15,11 +15,20 @@
 #endif
 
 // We require std::math functions in the complex builtins below.
+#ifdef __NVPTX__
 #include <cmath>
-
 #define __CUDA__
+#define __ARCHTYPES__ nvptx,nvptx64
+#endif // __NVPTX__
+
+#ifdef __AMDGCN__
+#include <__clang_hip_libdevice_declares.h>
+#define __HIP__
+#define __ARCHTYPES__ amdgcn
+#endif // __AMDGCN__
+
 #include <__clang_cuda_complex_builtins.h>
-#endif
+#endif //__CLANG_OPENMP_COMPLEX__
 
 // Grab the host header too.
 #include_next <complex>
@@ -36,15 +45,17 @@
 // arithmetic and calls to non-complex functions, all of which we can then
 // handle.
 #ifndef _LIBCPP_STD_VER
+#ifndef _GLIBCXX_COMPLEX
 
 #pragma omp begin declare variant match(                                       \
-    device = {arch(nvptx, nvptx64)},                                           \
+    device = {arch(__ARCHTYPES__)},                                            \
     implementation = {extension(match_any, allow_templates)})
 
 #include <complex_cmath.h>
 
 #pragma omp end declare variant
 
-#endif
+#endif // _GLIBCXX_COMPLEX
+#endif // _LIBCPP_STD_VER
 
-#endif
+#endif // __cplusplus
Index: clang/lib/Headers/openmp_wrappers/hip/hip_runtime.h
===================================================================
--- /dev/null
+++ clang/lib/Headers/openmp_wrappers/hip/hip_runtime.h
@@ -0,0 +1,28 @@
+/*===-- hip_runtime - OpenMP hip_runtime.h wrapper for target regions ------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_OPENMP_HIP_RUNTIME_H__
+#define __CLANG_OPENMP_HIP_RUNTIME_H__
+
+#ifndef _OPENMP
+#error "This file is for OpenMP compilation only."
+#endif
+
+#include <time.h>
+#define __OPENMP_AMDGCN__
+#include_next <hip/hip_runtime.h>
+
+#pragma omp begin declare variant match(                                       \
+    device = {arch(amdgcn)}, implementation = {extension(match_any)})
+
+#pragma omp end declare variant
+
+// Now get the actual hip headers
+
+#endif // __CLANG_OPENMP_HIP_RUNTIME_H__
Index: clang/lib/Headers/openmp_wrappers/math.h
===================================================================
--- clang/lib/Headers/openmp_wrappers/math.h
+++ clang/lib/Headers/openmp_wrappers/math.h
@@ -48,4 +48,18 @@
 
 #pragma omp end declare variant
 
+#pragma omp begin declare variant match(                                       \
+    device = {arch(amdgcn)}, implementation = {extension(match_any)})
+
+#ifndef __OPENMP_AMDGCN__
+#define __OPENMP_AMDGCN__
+#endif
+
+#ifndef __HIP__
+#define __HIP__
 #endif
+
+#include <__clang_hip_math.h>
+#pragma omp end declare variant
+
+#endif // __CLANG_OPENMP_MATH_H__
Index: clang/lib/Headers/openmp_wrappers/new
===================================================================
--- clang/lib/Headers/openmp_wrappers/new
+++ clang/lib/Headers/openmp_wrappers/new
@@ -11,7 +11,7 @@
 
 #include_next <new>
 
-#if defined(__NVPTX__) && defined(_OPENMP)
+#if (defined(__NVPTX__) || defined(__AMDGCN__))  && defined(_OPENMP)
 
 #include <cstdlib>
 
Index: clang/test/Headers/Inputs/include/algorithm
===================================================================
--- /dev/null
+++ clang/test/Headers/Inputs/include/algorithm
@@ -0,0 +1,21 @@
+#pragma once
+
+// Copied from libcxx
+
+namespace std {
+
+template <class T>
+    const T&
+    max(const T& a, const T& b); // constexpr in C++14
+template <class T, class Compare>
+    const T&
+    max(const T& a, const T& b, Compare comp);  // constexpr in C++14
+
+template <class T>
+    const T&
+    min(const T& a, const T& b);  // constexpr in C++14
+template <class T, class Compare>
+    const T&
+    min(const T& a, const T& b, Compare comp);  // constexpr in C++14
+
+}
Index: clang/test/Headers/Inputs/include/cstdint
===================================================================
--- /dev/null
+++ clang/test/Headers/Inputs/include/cstdint
@@ -0,0 +1,21 @@
+#pragma once
+
+#include <stdint.h>
+
+namespace std {
+#ifdef __INT32_TYPE__
+using ::uint32_t;
+#endif
+
+#ifdef __INT64_TYPE__
+using ::uint64_t;
+#endif
+
+#ifdef __INTPTR_TYPE__
+using ::intptr_t;
+using ::uintptr_t;
+#else
+#error Every target should have __INTPTR_TYPE__
+#endif
+
+} // namespace std
Index: clang/test/Headers/Inputs/include/cstdlib
===================================================================
--- clang/test/Headers/Inputs/include/cstdlib
+++ clang/test/Headers/Inputs/include/cstdlib
@@ -16,6 +16,7 @@
 {
 
 using ::abs;
+using ::size_t;
 
 inline long
 abs(long __i) { return __builtin_labs(__i); }
Index: clang/test/Headers/Inputs/include/exception
===================================================================
--- /dev/null
+++ clang/test/Headers/Inputs/include/exception
@@ -0,0 +1,17 @@
+#pragma once
+
+// Copied from libcxx
+
+namespace std {
+
+class exception
+{
+public:
+    exception() noexcept;
+    exception(const exception&) noexcept;
+    exception& operator=(const exception&) noexcept;
+    virtual ~exception() noexcept;
+    virtual const char* what() const noexcept;
+};
+
+}
Index: clang/test/Headers/Inputs/include/new
===================================================================
--- /dev/null
+++ clang/test/Headers/Inputs/include/new
@@ -0,0 +1,50 @@
+#pragma once
+
+#include <cstdlib>
+#include <exception>
+
+// Copied from libcxx
+
+namespace std {
+
+struct nothrow_t { explicit nothrow_t() = default; };
+extern const nothrow_t nothrow;
+
+class bad_alloc
+    : public exception
+{
+public:
+    bad_alloc();
+    virtual ~bad_alloc();
+    virtual const char* what() const;
+};
+
+class bad_array_new_length
+    : public bad_alloc
+{
+public:
+    bad_array_new_length();
+    virtual ~bad_array_new_length();
+    virtual const char* what() const;
+};
+
+typedef void (*new_handler)();
+new_handler set_new_handler(new_handler);
+new_handler get_new_handler();
+
+}
+
+void* operator new(std::size_t __sz);
+void* operator new(std::size_t __sz, const std::nothrow_t&);
+void  operator delete(void* __p);
+void  operator delete(void* __p, const std::nothrow_t&);
+
+void* operator new[](std::size_t __sz);
+void* operator new[](std::size_t __sz, const std::nothrow_t&);
+void  operator delete[](void* __p);
+void  operator delete[](void* __p, const std::nothrow_t&);
+
+inline void* operator new  (std::size_t, void* __p) {return __p;}
+inline void* operator new[](std::size_t, void* __p) {return __p;}
+inline void  operator delete  (void*, void*) {}
+inline void  operator delete[](void*, void*) {}
Index: clang/test/Headers/Inputs/include/stdlib.h
===================================================================
--- clang/test/Headers/Inputs/include/stdlib.h
+++ clang/test/Headers/Inputs/include/stdlib.h
@@ -4,3 +4,6 @@
 #ifndef __cplusplus
 extern int abs(int __x) __attribute__((__const__));
 #endif
+
+void free(void* ptr);
+void* malloc(size_t size);