diff --git a/openmp/libomptarget/DeviceRTL/CMakeLists.txt b/openmp/libomptarget/DeviceRTL/CMakeLists.txt --- a/openmp/libomptarget/DeviceRTL/CMakeLists.txt +++ b/openmp/libomptarget/DeviceRTL/CMakeLists.txt @@ -113,6 +113,8 @@ ${source_directory}/Tasking.cpp ${source_directory}/Utils.cpp ${source_directory}/Workshare.cpp + ${source_directory}/Xteamr.cpp + ${source_directory}/Rfuns.cpp ) set(clang_opt_flags -O3 -mllvm -openmp-opt-disable -DSHARED_SCRATCHPAD_SIZE=512) diff --git a/openmp/libomptarget/DeviceRTL/include/Interface.h b/openmp/libomptarget/DeviceRTL/include/Interface.h --- a/openmp/libomptarget/DeviceRTL/include/Interface.h +++ b/openmp/libomptarget/DeviceRTL/include/Interface.h @@ -239,6 +239,320 @@ void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct); +/// +/// Helper functions for high performance cross team reductions. +/// +/// Name decoder: __kmpc_xteamr__x +/// number of warps/waves in team +/// 32 or 64 +/// example: __kmpc_xteam_f_16x64 is the helper function for data +/// type float with fixed teamsize of 1024 threads. +/// There are 48(6 X 8) helper functions. +/// 6 configurations of teamsize are currently created. +/// The most performant configs use max teamsize 1024: 16x64 and 32x32. +/// Currently the Other configs are 8x64, 4x64, 16x32, and 8x32. +/// 8 data types available: double, float, int, uint, long, ulong, +/// double _Complex, and float _complex +/// All xteam helper functions have these 7 args: +/// arg1: the thread local reduction value +/// arg2: pointer to where result is written +/// arg3: global array of team values for this reduction instance +/// arg4: atomic counter of completed teams for this reduction instance +/// arg5: void function pointer of pair reduction function, +/// (e.g. 
sum(&a,b),min(&a,b),max(&a,b) +/// arg6: equivalent (to arg5) void function pointer of pair reduction +/// function on LDS memory, (e.g. sum(&a,&b),min(&a,&b),max(&a,&b) +/// arg7: Initializing value for the reduction type +/// +#define _RF_LDS volatile __attribute__((address_space(3))) +void __kmpc_xteamr_d_16x64(double v, double *r_ptr, double *tvals, + uint32_t *td_ptr, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), + const double iv); +void __kmpc_xteamr_f_16x64(float v, float *r_ptr, float *tvals, + uint32_t *td_ptr, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), + const float iv); +void __kmpc_xteamr_cd_16x64(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(_RF_LDS double _Complex *, + _RF_LDS double _Complex *), + const double _Complex iv); +void __kmpc_xteamr_cf_16x64(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(_RF_LDS float _Complex *, + _RF_LDS float _Complex *), + const float _Complex iv); +void __kmpc_xteamr_i_16x64(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), + const int iv); +void __kmpc_xteamr_ui_16x64(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(_RF_LDS uint32_t *, + _RF_LDS uint32_t *), + const uint32_t iv); +void __kmpc_xteamr_l_16x64(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), + const long iv); +void __kmpc_xteamr_ul_16x64(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(_RF_LDS uint64_t *, + _RF_LDS uint64_t *), + const uint64_t iv); +void 
__kmpc_xteamr_d_8x64(double v, double *r_ptr, double *tvals, + uint32_t *td_ptr, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), + const double iv); +void __kmpc_xteamr_f_8x64(float v, float *r_ptr, float *tvals, uint32_t *td_ptr, + void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), + const float iv); +void __kmpc_xteamr_cd_8x64(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(_RF_LDS double _Complex *, + _RF_LDS double _Complex *), + const double _Complex iv); +void __kmpc_xteamr_cf_8x64(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(_RF_LDS float _Complex *, + _RF_LDS float _Complex *), + const float _Complex iv); +void __kmpc_xteamr_i_8x64(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), + const int iv); +void __kmpc_xteamr_ui_8x64(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(_RF_LDS uint32_t *, + _RF_LDS uint32_t *), + const uint32_t iv); +void __kmpc_xteamr_l_8x64(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), + const long iv); +void __kmpc_xteamr_ul_8x64(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(_RF_LDS uint64_t *, + _RF_LDS uint64_t *), + const uint64_t iv); +void __kmpc_xteamr_d_4x64(double v, double *r_ptr, double *tvals, + uint32_t *td_ptr, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), + const double iv); +void __kmpc_xteamr_f_4x64(float v, float *r_ptr, float *tvals, uint32_t *td_ptr, + void (*_rf)(float *, float), + void 
(*_rf_lds)(_RF_LDS float *, _RF_LDS float *), + const float iv); +void __kmpc_xteamr_cd_4x64(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(_RF_LDS double _Complex *, + _RF_LDS double _Complex *), + const double _Complex iv); +void __kmpc_xteamr_cf_4x64(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(_RF_LDS float _Complex *, + _RF_LDS float _Complex *), + const float _Complex iv); +void __kmpc_xteamr_i_4x64(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), + const int iv); +void __kmpc_xteamr_ui_4x64(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(_RF_LDS uint32_t *, + _RF_LDS uint32_t *), + const uint32_t iv); +void __kmpc_xteamr_l_4x64(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), + const long iv); +void __kmpc_xteamr_ul_4x64(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(_RF_LDS uint64_t *, + _RF_LDS uint64_t *), + const uint64_t iv); +void __kmpc_xteamr_d_32x32(double v, double *r_ptr, double *tvals, + uint32_t *td_ptr, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), + const double iv); +void __kmpc_xteamr_f_32x32(float v, float *r_ptr, float *tvals, + uint32_t *td_ptr, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), + const float iv); +void __kmpc_xteamr_cd_32x32(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(_RF_LDS double _Complex *, + _RF_LDS double _Complex *), + const 
double _Complex iv); +void __kmpc_xteamr_cf_32x32(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(_RF_LDS float _Complex *, + _RF_LDS float _Complex *), + const float _Complex iv); + +void __kmpc_xteamr_i_32x32(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), + const int iv); + +void __kmpc_xteamr_ui_32x32(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(_RF_LDS uint32_t *, + _RF_LDS uint32_t *), + const uint32_t iv); +void __kmpc_xteamr_l_32x32(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), + const long iv); +void __kmpc_xteamr_ul_32x32(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(_RF_LDS uint64_t *, + _RF_LDS uint64_t *), + const uint64_t iv); +void __kmpc_xteamr_d_16x32(double v, double *r_ptr, double *tvals, + uint32_t *td_ptr, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), + const double iv); +void __kmpc_xteamr_f_16x32(float v, float *r_ptr, float *tvals, + uint32_t *td_ptr, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), + const float iv); +void __kmpc_xteamr_cd_16x32(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(_RF_LDS double _Complex *, + _RF_LDS double _Complex *), + const double _Complex iv); +void __kmpc_xteamr_cf_16x32(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(_RF_LDS float _Complex *, + _RF_LDS float _Complex *), + const float _Complex iv); + +void 
__kmpc_xteamr_i_16x32(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), + const int iv); +void __kmpc_xteamr_ui_16x32(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(_RF_LDS uint32_t *, + _RF_LDS uint32_t *), + const uint32_t iv); +void __kmpc_xteamr_l_16x32(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), + const long iv); +void __kmpc_xteamr_ul_16x32(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(_RF_LDS uint64_t *, + _RF_LDS uint64_t *), + const uint64_t iv); +void __kmpc_xteamr_d_8x32(double v, double *r_ptr, double *tvals, + uint32_t *td_ptr, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), + const double iv); +void __kmpc_xteamr_f_8x32(float v, float *r_ptr, float *tvals, uint32_t *td_ptr, + void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), + const float iv); +void __kmpc_xteamr_cd_8x32(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(_RF_LDS double _Complex *, + _RF_LDS double _Complex *), + const double _Complex iv); +void __kmpc_xteamr_cf_8x32(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(_RF_LDS float _Complex *, + _RF_LDS float _Complex *), + const float _Complex iv); +void __kmpc_xteamr_i_8x32(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), + const int iv); +void __kmpc_xteamr_ui_8x32(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void 
(*_rf_lds)(_RF_LDS uint32_t *, + _RF_LDS uint32_t *), + const uint32_t iv); +void __kmpc_xteamr_l_8x32(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), + const long iv); +void __kmpc_xteamr_ul_8x32(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(_RF_LDS uint64_t *, + _RF_LDS uint64_t *), + const uint64_t iv); + +/// Builtin pair reduction functions. +/// These become function pointers for arg5 and arg6 of xteamr above. +/// Each pair reduction function must have two variants to support xteamr. +/// The 1st is for TLS memory and the 2nd is for LDS (scratchpad) memory. +/// These are defined in Rfuns.cpp. User defined reductions require +/// that Clang codegen generate these functions. +void __kmpc_rfun_sum_d(double *val, double otherval); +void __kmpc_rfun_sum_lds_d(_RF_LDS double *val, _RF_LDS double *otherval); +void __kmpc_rfun_sum_f(float *val, float otherval); +void __kmpc_rfun_sum_lds_f(_RF_LDS float *val, _RF_LDS float *otherval); +void __kmpc_rfun_sum_cd(double _Complex *val, double _Complex otherval); +void __kmpc_rfun_sum_lds_cd(_RF_LDS double _Complex *val, + _RF_LDS double _Complex *otherval); +void __kmpc_rfun_sum_cf(float _Complex *val, float _Complex otherval); +void __kmpc_rfun_sum_lds_cf(_RF_LDS float _Complex *val, + _RF_LDS float _Complex *otherval); +void __kmpc_rfun_sum_i(int *val, int otherval); +void __kmpc_rfun_sum_lds_i(_RF_LDS int *val, _RF_LDS int *otherval); +void __kmpc_rfun_sum_ui(unsigned int *val, unsigned int otherval); +void __kmpc_rfun_sum_lds_ui(_RF_LDS unsigned int *val, + _RF_LDS unsigned int *otherval); +void __kmpc_rfun_sum_l(long *val, long otherval); +void __kmpc_rfun_sum_lds_l(_RF_LDS long *val, _RF_LDS long *otherval); +void __kmpc_rfun_sum_ul(unsigned long *val, unsigned long otherval); +void __kmpc_rfun_sum_lds_ul(_RF_LDS unsigned long *val, + _RF_LDS unsigned long 
*otherval); +/// Complex variables have no compare, so no min or max for cf and cd. +void __kmpc_rfun_min_d(double *val, double otherval); +void __kmpc_rfun_min_lds_d(_RF_LDS double *val, _RF_LDS double *otherval); +void __kmpc_rfun_min_f(float *val, float otherval); +void __kmpc_rfun_min_lds_f(_RF_LDS float *val, _RF_LDS float *otherval); +void __kmpc_rfun_min_i(int *val, int otherval); +void __kmpc_rfun_min_lds_i(_RF_LDS int *val, _RF_LDS int *otherval); +void __kmpc_rfun_min_ui(unsigned int *val, unsigned int otherval); +void __kmpc_rfun_min_lds_ui(_RF_LDS unsigned int *val, + _RF_LDS unsigned int *otherval); +void __kmpc_rfun_min_l(long *val, long otherval); +void __kmpc_rfun_min_lds_l(_RF_LDS long *val, _RF_LDS long *otherval); +void __kmpc_rfun_min_ul(unsigned long *val, unsigned long otherval); +void __kmpc_rfun_min_lds_ul(_RF_LDS unsigned long *val, + _RF_LDS unsigned long *otherval); +void __kmpc_rfun_max_d(double *val, double otherval); +void __kmpc_rfun_max_lds_d(_RF_LDS double *val, _RF_LDS double *otherval); +void __kmpc_rfun_max_f(float *val, float otherval); +void __kmpc_rfun_max_lds_f(_RF_LDS float *val, _RF_LDS float *otherval); +void __kmpc_rfun_max_i(int *val, int otherval); +void __kmpc_rfun_max_lds_i(_RF_LDS int *val, _RF_LDS int *otherval); +void __kmpc_rfun_max_ui(unsigned int *val, unsigned int otherval); +void __kmpc_rfun_max_lds_ui(_RF_LDS unsigned int *val, + _RF_LDS unsigned int *otherval); +void __kmpc_rfun_max_l(long *val, long otherval); +void __kmpc_rfun_max_lds_l(_RF_LDS long *val, _RF_LDS long *otherval); +void __kmpc_rfun_max_ul(unsigned long *val, unsigned long otherval); +void __kmpc_rfun_max_lds_ul(_RF_LDS unsigned long *val, + _RF_LDS unsigned long *otherval); +#undef _RF_LDS ///} /// Synchronization diff --git a/openmp/libomptarget/DeviceRTL/src/Rfuns.cpp b/openmp/libomptarget/DeviceRTL/src/Rfuns.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Rfuns.cpp @@ -0,0 +1,145 @@ +//===---- Rfuns.cpp - 
OpenMP reduction functions ---- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains simple reduction functions used as function pointers to +// reduction helper functions for cross team reductions defined in Xteamr.cpp +// +//===----------------------------------------------------------------------===// + +#pragma omp declare target + +#define _RF_ATTR extern "C" __attribute__((flatten, always_inline)) void +#define _RF_LDS volatile __attribute__((address_space(3))) + +_RF_ATTR __kmpc_rfun_sum_d(double *val, double otherval) { *val += otherval; } +_RF_ATTR __kmpc_rfun_sum_lds_d(_RF_LDS double *val, _RF_LDS double *otherval) { + *val += *otherval; +} +_RF_ATTR __kmpc_rfun_sum_f(float *val, float otherval) { *val += otherval; } +_RF_ATTR __kmpc_rfun_sum_lds_f(_RF_LDS float *val, _RF_LDS float *otherval) { + *val += *otherval; +} +_RF_ATTR __kmpc_rfun_sum_cd(double _Complex *val, double _Complex otherval) { + *val += otherval; +} +_RF_ATTR __kmpc_rfun_sum_lds_cd(_RF_LDS double _Complex *val, + _RF_LDS double _Complex *otherval) { + *val += *otherval; +} +_RF_ATTR __kmpc_rfun_sum_cf(float _Complex *val, float _Complex otherval) { + *val += otherval; +} +_RF_ATTR __kmpc_rfun_sum_lds_cf(_RF_LDS float _Complex *val, + _RF_LDS float _Complex *otherval) { + *val += *otherval; +} +_RF_ATTR __kmpc_rfun_sum_i(int *val, int otherval) { *val += otherval; } +_RF_ATTR __kmpc_rfun_sum_lds_i(_RF_LDS int *val, _RF_LDS int *otherval) { + *val += *otherval; +} +_RF_ATTR __kmpc_rfun_sum_ui(unsigned int *val, unsigned int otherval) { + *val += otherval; +} +_RF_ATTR __kmpc_rfun_sum_lds_ui(_RF_LDS unsigned int *val, + _RF_LDS unsigned int *otherval) { + *val += *otherval; +} +_RF_ATTR __kmpc_rfun_sum_l(long 
*val, long otherval) { *val += otherval; } +_RF_ATTR __kmpc_rfun_sum_lds_l(_RF_LDS long *val, _RF_LDS long *otherval) { + *val += *otherval; +} +_RF_ATTR __kmpc_rfun_sum_ul(unsigned long *val, unsigned long otherval) { + *val += otherval; +} +_RF_ATTR __kmpc_rfun_sum_lds_ul(_RF_LDS unsigned long *val, + _RF_LDS unsigned long *otherval) { + *val += *otherval; +} + +_RF_ATTR __kmpc_rfun_min_d(double *val, double otherval) { + *val = (otherval < *val) ? otherval : *val; +} +_RF_ATTR __kmpc_rfun_min_lds_d(_RF_LDS double *val, _RF_LDS double *otherval) { + *val = (*otherval < *val) ? *otherval : *val; +} +_RF_ATTR __kmpc_rfun_min_f(float *val, float otherval) { + *val = (otherval < *val) ? otherval : *val; +} +_RF_ATTR __kmpc_rfun_min_lds_f(_RF_LDS float *val, _RF_LDS float *otherval) { + *val = (*otherval < *val) ? *otherval : *val; +} +_RF_ATTR __kmpc_rfun_min_i(int *val, int otherval) { + *val = (otherval < *val) ? otherval : *val; +} +_RF_ATTR __kmpc_rfun_min_lds_i(_RF_LDS int *val, _RF_LDS int *otherval) { + *val = (*otherval < *val) ? *otherval : *val; +} +_RF_ATTR __kmpc_rfun_min_ui(unsigned int *val, unsigned int otherval) { + *val = (otherval < *val) ? otherval : *val; +} +_RF_ATTR __kmpc_rfun_min_lds_ui(_RF_LDS unsigned int *val, + _RF_LDS unsigned int *otherval) { + *val = (*otherval < *val) ? *otherval : *val; +} +_RF_ATTR __kmpc_rfun_min_l(long *val, long otherval) { + *val = (otherval < *val) ? otherval : *val; +} +_RF_ATTR __kmpc_rfun_min_lds_l(_RF_LDS long *val, _RF_LDS long *otherval) { + *val = (*otherval < *val) ? *otherval : *val; +} +_RF_ATTR __kmpc_rfun_min_ul(unsigned long *val, unsigned long otherval) { + *val = (otherval < *val) ? otherval : *val; +} +_RF_ATTR __kmpc_rfun_min_lds_ul(_RF_LDS unsigned long *val, + _RF_LDS unsigned long *otherval) { + *val = (*otherval < *val) ? *otherval : *val; +} + +_RF_ATTR __kmpc_rfun_max_d(double *val, double otherval) { + *val = (otherval > *val) ? 
otherval : *val; +} +_RF_ATTR __kmpc_rfun_max_lds_d(_RF_LDS double *val, _RF_LDS double *otherval) { + *val = (*otherval > *val) ? *otherval : *val; +} +_RF_ATTR __kmpc_rfun_max_f(float *val, float otherval) { + *val = (otherval > *val) ? otherval : *val; +} +_RF_ATTR __kmpc_rfun_max_lds_f(_RF_LDS float *val, _RF_LDS float *otherval) { + *val = (*otherval > *val) ? *otherval : *val; +} +_RF_ATTR __kmpc_rfun_max_i(int *val, int otherval) { + *val = (otherval > *val) ? otherval : *val; +} +_RF_ATTR __kmpc_rfun_max_lds_i(_RF_LDS int *val, _RF_LDS int *otherval) { + *val = (*otherval > *val) ? *otherval : *val; +} +_RF_ATTR __kmpc_rfun_max_ui(unsigned int *val, unsigned int otherval) { + *val = (otherval > *val) ? otherval : *val; +} +_RF_ATTR __kmpc_rfun_max_lds_ui(_RF_LDS unsigned int *val, + _RF_LDS unsigned int *otherval) { + *val = (*otherval > *val) ? *otherval : *val; +} +_RF_ATTR __kmpc_rfun_max_l(long *val, long otherval) { + *val = (otherval > *val) ? otherval : *val; +} +_RF_ATTR __kmpc_rfun_max_lds_l(_RF_LDS long *val, _RF_LDS long *otherval) { + *val = (*otherval > *val) ? *otherval : *val; +} +_RF_ATTR __kmpc_rfun_max_ul(unsigned long *val, unsigned long otherval) { + *val = (otherval > *val) ? otherval : *val; +} +_RF_ATTR __kmpc_rfun_max_lds_ul(_RF_LDS unsigned long *val, + _RF_LDS unsigned long *otherval) { + *val = (*otherval > *val) ? *otherval : *val; +} + +#undef _RF_ATTR +#undef _RF_LDS + +#pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Xteamr.cpp b/openmp/libomptarget/DeviceRTL/src/Xteamr.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Xteamr.cpp @@ -0,0 +1,704 @@ +//===---- Xteamr.cpp - OpenMP cross team helper functions ---- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains helper functions for cross team reductions +// +//===----------------------------------------------------------------------===// + +#include "Debug.h" +#include "Interface.h" +#include "Mapping.h" +#include "State.h" +#include "Synchronization.h" +#include "Types.h" +#include "Utils.h" + +#define __XTEAM_SHARED_LDS volatile __attribute__((address_space(3))) + +using namespace _OMP; + +#pragma omp declare target +// Headers for specialized shfl_xor +double xteamr_shfl_xor_d(double var, const int lane_mask, const uint32_t width); +float xteamr_shfl_xor_f(float var, const int lane_mask, const uint32_t width); +int xteamr_shfl_xor_int(int var, const int lane_mask, const uint32_t width); +double _Complex xteamr_shfl_xor_cd(double _Complex var, const int lane_mask, + const uint32_t width); +float _Complex xteamr_shfl_xor_cf(float _Complex var, const int lane_mask, + const uint32_t width); + +// Define the arch (amdgcn vs nvptx) variants of shfl + +#pragma omp begin declare variant match(device = {arch(amdgcn)}) + +int xteamr_shfl_xor_int(int var, const int lane_mask, const uint32_t width) { + int self = mapping::getThreadIdInWarp(); // __lane_id(); + int index = self ^ lane_mask; + index = index >= ((self + width) & ~(width - 1)) ? 
self : index; + return __builtin_amdgcn_ds_bpermute(index << 2, var); +} +float xteamr_shfl_xor_f(float var, const int lane_mask, const uint32_t width) { + union { + int i; + unsigned u; + float f; + } tmp; + tmp.f = var; + tmp.i = xteamr_shfl_xor_int(tmp.i, lane_mask, width); + return tmp.f; +} +double xteamr_shfl_xor_d(double var, const int lane_mask, + const uint32_t width) { + static_assert(sizeof(double) == 2 * sizeof(int), ""); + static_assert(sizeof(double) == sizeof(uint64_t), ""); + + int tmp[2]; + __builtin_memcpy(tmp, &var, sizeof(tmp)); + tmp[0] = xteamr_shfl_xor_int(tmp[0], lane_mask, width); + tmp[1] = xteamr_shfl_xor_int(tmp[1], lane_mask, width); + + uint64_t tmp0 = + (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + double tmp1; + __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); + return tmp1; +} + +double _Complex xteamr_shfl_xor_cd(double _Complex var, const int lane_mask, + const uint32_t width) { + __real__(var) = xteamr_shfl_xor_d(__real__(var), lane_mask, width); + __imag__(var) = xteamr_shfl_xor_d(__imag__(var), lane_mask, width); + return var; +} +float _Complex xteamr_shfl_xor_cf(float _Complex var, const int lane_mask, + const uint32_t width) { + __real__(var) = xteamr_shfl_xor_f(__real__(var), lane_mask, width); + __imag__(var) = xteamr_shfl_xor_f(__imag__(var), lane_mask, width); + return var; +} +#pragma omp end declare variant + +#pragma omp begin declare variant match( \ + device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)}) + +int xteamr_shfl_xor_int(int var, const int lane_mask, const uint32_t width) { + int c = ((32 - width) << 8) | 0x1f; + return __nvvm_shfl_sync_bfly_i32(0xFFFFFFFF, var, lane_mask, c); +} +float xteamr_shfl_xor_f(float var, const int lane_mask, const uint32_t width) { + union { + int i; + unsigned u; + float f; + } tmp; + tmp.f = var; + tmp.i = xteamr_shfl_xor_int(tmp.i, lane_mask, width); + return tmp.f; +} +double xteamr_shfl_xor_d(double var, int laneMask, const uint32_t width) { + 
unsigned lo, hi; + asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var)); + hi = xteamr_shfl_xor_int(hi, laneMask, width); + lo = xteamr_shfl_xor_int(lo, laneMask, width); + asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi)); + return var; +} +double _Complex xteamr_shfl_xor_cd(double _Complex var, const int lane_mask, + const uint32_t width) { + __real__(var) = xteamr_shfl_xor_d(__real__(var), lane_mask, width); + __imag__(var) = xteamr_shfl_xor_d(__imag__(var), lane_mask, width); + return var; +} +float _Complex xteamr_shfl_xor_cf(float _Complex var, const int lane_mask, + const uint32_t width) { + __real__(var) = xteamr_shfl_xor_f(__real__(var), lane_mask, width); + __imag__(var) = xteamr_shfl_xor_f(__imag__(var), lane_mask, width); + return var; +} +#pragma omp end declare variant +// } // end impl namespace + +// tag dispatching of type specific shfl_xor, get_low, and get_high +struct _d_tag {}; +struct _f_tag {}; +struct _cd_tag {}; +struct _cf_tag {}; +struct _i_tag {}; +struct _ui_tag {}; +struct _l_tag {}; +struct _ul_tag {}; +template struct __dispatch_tag; +template <> struct __dispatch_tag { + typedef _d_tag type; +}; +template <> struct __dispatch_tag { + typedef _f_tag type; +}; +template <> struct __dispatch_tag { + typedef _cd_tag type; +}; +template <> struct __dispatch_tag { + typedef _cf_tag type; +}; +template <> struct __dispatch_tag { + typedef _i_tag type; +}; +template <> struct __dispatch_tag { + typedef _ui_tag type; +}; +template <> struct __dispatch_tag { + typedef _l_tag type; +}; +template <> struct __dispatch_tag { + typedef _ul_tag type; +}; +template +double xteamr_shfl_xor(_d_tag tag, double var, const int lane_mask) { + return xteamr_shfl_xor_d(var, lane_mask, _WSZ); +} +template +float xteamr_shfl_xor(_f_tag tag, float var, const int lane_mask) { + return xteamr_shfl_xor_f(var, lane_mask, _WSZ); +} +template +double _Complex xteamr_shfl_xor(_cd_tag tag, double _Complex var, + const int lane_mask) { 
+ return xteamr_shfl_xor_cd(var, lane_mask, _WSZ); +} +template +float _Complex xteamr_shfl_xor(_cf_tag tag, float _Complex var, + const int lane_mask) { + return xteamr_shfl_xor_cf(var, lane_mask, _WSZ); +} +template +int xteamr_shfl_xor(_i_tag tag, int var, const int lane_mask) { + return xteamr_shfl_xor_int(var, lane_mask, _WSZ); +} +template +unsigned int xteamr_shfl_xor(_ui_tag tag, unsigned int var, + const int lane_mask) { + return xteamr_shfl_xor_int(var, lane_mask, _WSZ); +} +template +long xteamr_shfl_xor(_l_tag tag, long var, const int lane_mask) { + return xteamr_shfl_xor_d(var, lane_mask, _WSZ); +} +template +unsigned long xteamr_shfl_xor(_ul_tag tag, unsigned long var, + const int lane_mask) { + return xteamr_shfl_xor_d(var, lane_mask, _WSZ); +} + +template +T xteamr_shfl_xor(T var, const int lane_mask) { + typedef typename __dispatch_tag::type tag; + return xteamr_shfl_xor<_WSZ>(tag(), var, lane_mask); +} + +template +__attribute__((flatten, always_inline)) void _xteam_reduction( + T val, T *r_ptr, T *team_vals, uint32_t *teams_done_ptr, + void (*_rf)(T *, T), + void (*_rf_lds)(__XTEAM_SHARED_LDS T *, __XTEAM_SHARED_LDS T *), + const T inival) { + + constexpr int32_t _NT = _NW * _WSZ; + const uint32_t omp_thread_num = mapping::getThreadIdInBlock(); + const uint32_t omp_team_num = mapping::getBlockId(); + const uint32_t wave_num = mapping::getWarpId(); // 0 15 + const uint32_t lane_num = mapping::getThreadIdInWarp(); // 0 63 + const uint32_t NumTeams = mapping::getNumberOfBlocks(); + static __XTEAM_SHARED_LDS T xwave_lds[_NW]; + static __XTEAM_SHARED_LDS bool __is_last_team; + + // Binary reduce each wave, then copy to xwave_lds[wave_num] + for (unsigned int offset = _WSZ / 2; offset > 0; offset >>= 1) + (*_rf)(&val, xteamr_shfl_xor(val, offset)); + if (lane_num == 0) + xwave_lds[wave_num] = val; + + // Binary reduce each wave's value into wave_lds[0] with lds memory. 
+ _OMP::synchronize::threads(); + for (unsigned int offset = _NW / 2; offset > 0; offset >>= 1) { + if (omp_thread_num < offset) + (*_rf_lds)(&(xwave_lds[omp_thread_num]), + &(xwave_lds[omp_thread_num + offset])); + } + _OMP::synchronize::threads(); + + // Discover the last team to complete cross wave reduction + // The team number of last team is nondeterministic. + __is_last_team = false; + if (omp_thread_num == 0) { + team_vals[omp_team_num] = xwave_lds[0]; + uint32_t td = atomic::inc(teams_done_ptr, NumTeams - 1u, __ATOMIC_SEQ_CST); + if (td == (NumTeams - 1u)) + __is_last_team = true; + } + + // This sync needed, so all threads from last team know they are in the last + // team. + _OMP::synchronize::threads(); + + if (__is_last_team) { + // All threads from last completed team enter here. + // All other teams exit. + if (omp_thread_num < NumTeams) + val = team_vals[omp_thread_num]; + else + val = inival; + + // Reduce each wave into xwave_lds[wave_num] + for (unsigned int offset = _WSZ / 2; offset > 0; offset >>= 1) + (*_rf)(&val, xteamr_shfl_xor(val, offset)); + if (lane_num == 0) + xwave_lds[wave_num] = val; + + // For final result, inside next if clause, We know wave_lds[0] is done + // But we need a sync here to ensure wave_lds[i!=0] is correct. + _OMP::synchronize::threads(); + + // Typically only 2 useable waves when <128 CUs. No gain to parallelizing + // these last 2 reductions. So do these on thread 0 into lane 0's val. + if (omp_thread_num == 0) { + unsigned int usableWaves = ((NumTeams - 1) / _WSZ) + 1; + for (unsigned int kk = 1; kk < usableWaves; kk++) + (*_rf_lds)(&xwave_lds[0], &xwave_lds[kk]); + + // Reduce with the original result value. + xwave_lds[1] = *r_ptr; + (*_rf_lds)(&xwave_lds[0], &xwave_lds[1]); + *r_ptr = xwave_lds[0]; + } + } +} + +// Calls to these __kmpc extern C functions are created in clang codegen +// for FORTRAN, c, and C++. They may also be used for sumulation and testing. 
+// The headers for these extern C functions are in ../include/Interface.h +// The compiler builds the name based on data type, +// number of waves in the team,and warpsize. +// +#define _EXT_ATTR extern "C" __attribute__((flatten, always_inline)) void +_EXT_ATTR +__kmpc_xteamr_d_16x64(double v, double *r_ptr, double *tvals, uint32_t *td_ptr, + void (*_rf)(double *, double), + void (*_rf_lds)(__XTEAM_SHARED_LDS double *, + __XTEAM_SHARED_LDS double *), + const double iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_f_16x64(float v, float *r_ptr, float *tvals, uint32_t *td_ptr, + void (*_rf)(float *, float), + void (*_rf_lds)(__XTEAM_SHARED_LDS float *, + __XTEAM_SHARED_LDS float *), + const float iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} + +_EXT_ATTR +__kmpc_xteamr_cd_16x64(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(__XTEAM_SHARED_LDS double _Complex *, + __XTEAM_SHARED_LDS double _Complex *), + double _Complex iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, + _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_cf_16x64(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(__XTEAM_SHARED_LDS float _Complex *, + __XTEAM_SHARED_LDS float _Complex *), + float _Complex iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, + _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_i_16x64(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(__XTEAM_SHARED_LDS int *, + __XTEAM_SHARED_LDS int *), + const int iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_ui_16x64(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(__XTEAM_SHARED_LDS uint32_t *, + 
__XTEAM_SHARED_LDS uint32_t *), + const uint32_t iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_l_16x64(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(__XTEAM_SHARED_LDS long *, + __XTEAM_SHARED_LDS long *), + const long iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_ul_16x64(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(__XTEAM_SHARED_LDS uint64_t *, + __XTEAM_SHARED_LDS uint64_t *), + const uint64_t iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} + +_EXT_ATTR +__kmpc_xteamr_d_8x64(double v, double *r_ptr, double *tvals, uint32_t *td_ptr, + void (*_rf)(double *, double), + void (*_rf_lds)(__XTEAM_SHARED_LDS double *, + __XTEAM_SHARED_LDS double *), + const double iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_f_8x64(float v, float *r_ptr, float *tvals, uint32_t *td_ptr, + void (*_rf)(float *, float), + void (*_rf_lds)(__XTEAM_SHARED_LDS float *, + __XTEAM_SHARED_LDS float *), + const float iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_cd_8x64(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(__XTEAM_SHARED_LDS double _Complex *, + __XTEAM_SHARED_LDS double _Complex *), + const double _Complex iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, + _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_cf_8x64(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(__XTEAM_SHARED_LDS float _Complex *, + __XTEAM_SHARED_LDS float _Complex *), + const float _Complex iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, + 
iv); +} +_EXT_ATTR +__kmpc_xteamr_i_8x64(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(__XTEAM_SHARED_LDS int *, + __XTEAM_SHARED_LDS int *), + const int iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_ui_8x64(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(__XTEAM_SHARED_LDS uint32_t *, + __XTEAM_SHARED_LDS uint32_t *), + const uint32_t iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_l_8x64(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(__XTEAM_SHARED_LDS long *, + __XTEAM_SHARED_LDS long *), + const long iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_ul_8x64(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(__XTEAM_SHARED_LDS uint64_t *, + __XTEAM_SHARED_LDS uint64_t *), + const uint64_t iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} + +_EXT_ATTR +__kmpc_xteamr_d_4x64(double v, double *r_ptr, double *tvals, uint32_t *td_ptr, + void (*_rf)(double *, double), + void (*_rf_lds)(__XTEAM_SHARED_LDS double *, + __XTEAM_SHARED_LDS double *), + const double iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_f_4x64(float v, float *r_ptr, float *tvals, uint32_t *td_ptr, + void (*_rf)(float *, float), + void (*_rf_lds)(__XTEAM_SHARED_LDS float *, + __XTEAM_SHARED_LDS float *), + const float iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_cd_4x64(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(__XTEAM_SHARED_LDS double _Complex *, + __XTEAM_SHARED_LDS double 
_Complex *), + const double _Complex iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, + _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_cf_4x64(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(__XTEAM_SHARED_LDS float _Complex *, + __XTEAM_SHARED_LDS float _Complex *), + const float _Complex iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, + iv); +} +_EXT_ATTR +__kmpc_xteamr_i_4x64(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(__XTEAM_SHARED_LDS int *, + __XTEAM_SHARED_LDS int *), + const int iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_ui_4x64(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(__XTEAM_SHARED_LDS uint32_t *, + __XTEAM_SHARED_LDS uint32_t *), + const uint32_t iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_l_4x64(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(__XTEAM_SHARED_LDS long *, + __XTEAM_SHARED_LDS long *), + const long iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_ul_4x64(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(__XTEAM_SHARED_LDS uint64_t *, + __XTEAM_SHARED_LDS uint64_t *), + const uint64_t iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} + +_EXT_ATTR +__kmpc_xteamr_d_32x32(double v, double *r_ptr, double *tvals, uint32_t *td_ptr, + void (*_rf)(double *, double), + void (*_rf_lds)(__XTEAM_SHARED_LDS double *, + __XTEAM_SHARED_LDS double *), + const double iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_f_32x32(float v, float *r_ptr, float *tvals, uint32_t 
*td_ptr, + void (*_rf)(float *, float), + void (*_rf_lds)(__XTEAM_SHARED_LDS float *, + __XTEAM_SHARED_LDS float *), + const float iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_cd_32x32(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(__XTEAM_SHARED_LDS double _Complex *, + __XTEAM_SHARED_LDS double _Complex *), + const double _Complex iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, + _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_cf_32x32(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(__XTEAM_SHARED_LDS float _Complex *, + __XTEAM_SHARED_LDS float _Complex *), + const float _Complex iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, + _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_i_32x32(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(__XTEAM_SHARED_LDS int *, + __XTEAM_SHARED_LDS int *), + const int iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_ui_32x32(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(__XTEAM_SHARED_LDS uint32_t *, + __XTEAM_SHARED_LDS uint32_t *), + const uint32_t iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_l_32x32(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(__XTEAM_SHARED_LDS long *, + __XTEAM_SHARED_LDS long *), + const long iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_ul_32x32(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(__XTEAM_SHARED_LDS uint64_t *, + __XTEAM_SHARED_LDS uint64_t *), 
+ const uint64_t iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} + +_EXT_ATTR +__kmpc_xteamr_d_16x32(double v, double *r_ptr, double *tvals, uint32_t *td_ptr, + void (*_rf)(double *, double), + void (*_rf_lds)(__XTEAM_SHARED_LDS double *, + __XTEAM_SHARED_LDS double *), + const double iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_f_16x32(float v, float *r_ptr, float *tvals, uint32_t *td_ptr, + void (*_rf)(float *, float), + void (*_rf_lds)(__XTEAM_SHARED_LDS float *, + __XTEAM_SHARED_LDS float *), + const float iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_cd_16x32(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(__XTEAM_SHARED_LDS double _Complex *, + __XTEAM_SHARED_LDS double _Complex *), + const double _Complex iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, + _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_cf_16x32(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(__XTEAM_SHARED_LDS float _Complex *, + __XTEAM_SHARED_LDS float _Complex *), + const float _Complex iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, + _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_i_16x32(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(__XTEAM_SHARED_LDS int *, + __XTEAM_SHARED_LDS int *), + const int iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_ui_16x32(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(__XTEAM_SHARED_LDS uint32_t *, + __XTEAM_SHARED_LDS uint32_t *), + const uint32_t iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR 
+__kmpc_xteamr_l_16x32(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(__XTEAM_SHARED_LDS long *, + __XTEAM_SHARED_LDS long *), + const long iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_ul_16x32(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(__XTEAM_SHARED_LDS uint64_t *, + __XTEAM_SHARED_LDS uint64_t *), + const uint64_t iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} + +_EXT_ATTR +__kmpc_xteamr_d_8x32(double v, double *r_ptr, double *tvals, uint32_t *td_ptr, + void (*_rf)(double *, double), + void (*_rf_lds)(__XTEAM_SHARED_LDS double *, + __XTEAM_SHARED_LDS double *), + const double iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_f_8x32(float v, float *r_ptr, float *tvals, uint32_t *td_ptr, + void (*_rf)(float *, float), + void (*_rf_lds)(__XTEAM_SHARED_LDS float *, + __XTEAM_SHARED_LDS float *), + const float iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_cd_8x32(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(__XTEAM_SHARED_LDS double _Complex *, + __XTEAM_SHARED_LDS double _Complex *), + const double _Complex iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, + _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_cf_8x32(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(__XTEAM_SHARED_LDS float _Complex *, + __XTEAM_SHARED_LDS float _Complex *), + const float _Complex iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, + iv); +} +_EXT_ATTR +__kmpc_xteamr_i_8x32(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void 
(*_rf_lds)(__XTEAM_SHARED_LDS int *, + __XTEAM_SHARED_LDS int *), + const int iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_ui_8x32(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(__XTEAM_SHARED_LDS uint32_t *, + __XTEAM_SHARED_LDS uint32_t *), + const uint32_t iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_l_8x32(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(__XTEAM_SHARED_LDS long *, + __XTEAM_SHARED_LDS long *), + const long iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_ul_8x32(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(__XTEAM_SHARED_LDS uint64_t *, + __XTEAM_SHARED_LDS uint64_t *), + const uint64_t iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +#undef _EXT_ATTR + +#pragma omp end declare target diff --git a/openmp/libomptarget/test/xteamr/README b/openmp/libomptarget/test/xteamr/README new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/test/xteamr/README @@ -0,0 +1,78 @@ +//===------- README for llvm-project/openmp/libomptarget/test/xteamr ------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +This xteamr test executes simulated reductions using the high performance +DeviceRTL xteam helper functions and compares to the existing OpenMP reduction. + +The goal of this smoke test is to ensure that the DeviceRTL helper functions +in Xteamr.cpp are correct and performant. 
This test (xteamr) will fail +(return a nonzero return code) if either the openmp reduction or the simulated +reduction gets the wrong answer. + +The simulated functions sim_dot, sim_max, and sim_min show what clang would +generate when it encounters an omp target pragma with one or more sum +reductions. See example below. + +The clang CodeGen changes are not yet complete. + +The interfaces to the helper functions that clang codegen is expected to generate +are contained in this test source. For building DeviceRTL the interfaces and +the documentation can be found in: + + llvm-project/openmp/libomptarget/DeviceRTL/include/Interface.h + +The definitions of all xteamr functions can be found in + + llvm-project/openmp/libomptarget/DeviceRTL/src/Xteamr.cpp + +The builtin simple pair reducing functions can be found here: + + llvm-project/openmp/libomptarget/DeviceRTL/src/Rfuns.cpp + +EXAMPLE: Given this code: + + double sum = 0.0; + #pragma omp target teams distribute parallel for map(tofrom: sum) reduction(+:sum) + for (int64_t i = 0; i < array_size; i++) + sum += a[i] * b[i]; + +Clang should generate code equivalent to this.
+ + #define _NUM_THREADS 1024 + devid = 0 ; // or default or whatever is on target construct + TEAM_PROCS = ompx_get_team_procs(devid); + team_vals0 = (double *) omp_target_alloc(sizeof(double) * TEAM_PROCS, devid); + teams_done_ptr0 = (uint32_t *) omp_target_alloc(sizeof(uint32_t),devid); + uint32_t zero = 0; + omp_target_memcpy(teams_done_ptr0, &zero , sizeof(uint32_t), 0, + 0, devid, omp_get_initial_device()); + #pragma omp target teams distribute parallel for \ + num_teams(TEAM_PROCS) num_threads(_NUM_THREADS) \ + map(tofrom:sum) is_device_ptr(team_vals0,teams_done_ptr0) + for (unsigned int k=0; k<(LOOP_TEAMS*_NUM_THREADS) ; k++) { + T val0 = 0.0; + LOOP_STRIDE=1; LOOP_START=0; LOOP_SIZE = array_size; + for (int64_t i = ((k * LOOP_STRIDE) + LOOP_START); i < LOOP_SIZE; + i += (TEAM_PROCS * _NUM_THREADS * LOOP_STRIDE)) + { + val0 += a[i] * b[i]; + } + __kmpc_xteamr_d_16x64(val0, &sum, team_vals0, teams_done_ptr0, + __kmpc_rfun_sum_d, __kmpc_rfun_sum_lds_d,0.0); + } + +If processor has 32 bit warp_size change __kmpc_xteamr_d_16x64 to __kmpc_xteamr_d_32x32 +The test script compile_run tests all configs of 1024, 512, and 256 threads +For very large arrays, the best performance is with 1024 threads. +If array_size is constant and known to be less than TEAM_PROCS*_NUM_THREADS, +the codegen may uses a smaller configuration of either 512 or 256 threads. + +To compile and execute +export LLVM_INSTALL= +export OFFLOAD_ARCH=gfx90a +./compile_run diff --git a/openmp/libomptarget/test/xteamr/header b/openmp/libomptarget/test/xteamr/header new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/test/xteamr/header @@ -0,0 +1,9 @@ +//== overload_insts_1024.h overloaded instatiations of Xteamr fcts -C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This file contains all instantiations of functions using 1024 threads +//===----------------------------------------------------------------------===// diff --git a/openmp/libomptarget/test/xteamr/overload_insts_1024.h b/openmp/libomptarget/test/xteamr/overload_insts_1024.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/test/xteamr/overload_insts_1024.h @@ -0,0 +1,279 @@ +//== overload_insts_1024.h overloaded instatiations of Xteamr fcts -C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This file contains all instantiations of functions using 1024 threads +//===----------------------------------------------------------------------===// + +// ---- Local overloads for testing sum +void __attribute__((flatten, always_inline)) +sum_overloaded_16x64(double val, double *rval, double *xteam_mem, + unsigned int *td_ptr, const double initval) { + __kmpc_xteamr_d_16x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_d, + __kmpc_rfun_sum_lds_d, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_16x64(float val, float *rval, float *xteam_mem, + unsigned int *td_ptr, const float initval) { + __kmpc_xteamr_f_16x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_f, + __kmpc_rfun_sum_lds_f, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_16x64(double _Complex val, double _Complex *rval, + double _Complex *xteam_mem, unsigned int *td_ptr, + const double _Complex initval) { + __kmpc_xteamr_cd_16x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_cd, + __kmpc_rfun_sum_lds_cd, initval); +} +void __attribute__((flatten, 
always_inline)) +sum_overloaded_16x64(float _Complex val, float _Complex *rval, + float _Complex *xteam_mem, unsigned int *td_ptr, + const float _Complex initval) { + __kmpc_xteamr_cf_16x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_cf, + __kmpc_rfun_sum_lds_cf, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_16x64(int val, int *rval, int *xteam_mem, unsigned int *td_ptr, + const int initval) { + __kmpc_xteamr_i_16x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_i, + __kmpc_rfun_sum_lds_i, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_16x64(unsigned int val, unsigned int *rval, + unsigned int *xteam_mem, unsigned int *td_ptr, + const unsigned int initval) { + __kmpc_xteamr_ui_16x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_ui, + __kmpc_rfun_sum_lds_ui, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_16x64(long val, long *rval, long *xteam_mem, + unsigned int *td_ptr, const long initval) { + __kmpc_xteamr_l_16x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_l, + __kmpc_rfun_sum_lds_l, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_16x64(unsigned long val, unsigned long *rval, + unsigned long *xteam_mem, unsigned int *td_ptr, + const unsigned long initval) { + __kmpc_xteamr_ul_16x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_ul, + __kmpc_rfun_sum_lds_ul, initval); +} + +// ---- Local overloads for testing max +void __attribute__((flatten, always_inline)) +max_overloaded_16x64(double val, double *rval, double *xteam_mem, + unsigned int *td_ptr, const double initval) { + __kmpc_xteamr_d_16x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_d, + __kmpc_rfun_max_lds_d, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_16x64(float val, float *rval, float *xteam_mem, + unsigned int *td_ptr, const float initval) { + __kmpc_xteamr_f_16x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_f, + __kmpc_rfun_max_lds_f, initval); +} +void 
__attribute__((flatten, always_inline)) +max_overloaded_16x64(int val, int *rval, int *xteam_mem, unsigned int *td_ptr, + const int initval) { + __kmpc_xteamr_i_16x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_i, + __kmpc_rfun_max_lds_i, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_16x64(unsigned int val, unsigned int *rval, + unsigned int *xteam_mem, unsigned int *td_ptr, + const unsigned int initval) { + __kmpc_xteamr_ui_16x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_ui, + __kmpc_rfun_max_lds_ui, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_16x64(long val, long *rval, long *xteam_mem, + unsigned int *td_ptr, const long initval) { + __kmpc_xteamr_l_16x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_l, + __kmpc_rfun_max_lds_l, initval); +} + +void __attribute__((flatten, always_inline)) +max_overloaded_16x64(unsigned long val, unsigned long *rval, + unsigned long *xteam_mem, unsigned int *td_ptr, + const unsigned long initval) { + __kmpc_xteamr_ul_16x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_ul, + __kmpc_rfun_max_lds_ul, initval); +} + +// ---- Local overloads for testing min +void __attribute__((flatten, always_inline)) +min_overloaded_16x64(double val, double *rval, double *xteam_mem, + unsigned int *td_ptr, const double initval) { + __kmpc_xteamr_d_16x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_d, + __kmpc_rfun_min_lds_d, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_16x64(float val, float *rval, float *xteam_mem, + unsigned int *td_ptr, const float initval) { + __kmpc_xteamr_f_16x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_f, + __kmpc_rfun_min_lds_f, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_16x64(int val, int *rval, int *xteam_mem, unsigned int *td_ptr, + const int initval) { + __kmpc_xteamr_i_16x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_i, + __kmpc_rfun_min_lds_i, initval); +} +void __attribute__((flatten, 
always_inline)) +min_overloaded_16x64(unsigned int val, unsigned int *rval, + unsigned int *xteam_mem, unsigned int *td_ptr, + const unsigned int initval) { + __kmpc_xteamr_ui_16x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_ui, + __kmpc_rfun_min_lds_ui, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_16x64(long val, long *rval, long *xteam_mem, + unsigned int *td_ptr, const long initval) { + __kmpc_xteamr_l_16x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_l, + __kmpc_rfun_min_lds_l, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_16x64(unsigned long val, unsigned long *rval, + unsigned long *xteam_mem, unsigned int *td_ptr, + const unsigned long initval) { + __kmpc_xteamr_ul_16x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_ul, + __kmpc_rfun_min_lds_ul, initval); +} + +// ---- Local overloads for testing sum with 32 item warps +void __attribute__((flatten, always_inline)) +sum_overloaded_32x32(double val, double *rval, double *xteam_mem, + unsigned int *td_ptr, const double initval) { + __kmpc_xteamr_d_32x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_d, + __kmpc_rfun_sum_lds_d, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_32x32(float val, float *rval, float *xteam_mem, + unsigned int *td_ptr, const float initval) { + __kmpc_xteamr_f_32x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_f, + __kmpc_rfun_sum_lds_f, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_32x32(double _Complex val, double _Complex *rval, + double _Complex *xteam_mem, unsigned int *td_ptr, + const double _Complex initval) { + __kmpc_xteamr_cd_32x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_cd, + __kmpc_rfun_sum_lds_cd, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_32x32(float _Complex val, float _Complex *rval, + float _Complex *xteam_mem, unsigned int *td_ptr, + const float _Complex initval) { + __kmpc_xteamr_cf_32x32(val, rval, xteam_mem, 
td_ptr, __kmpc_rfun_sum_cf, + __kmpc_rfun_sum_lds_cf, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_32x32(int val, int *rval, int *xteam_mem, unsigned int *td_ptr, + const int initval) { + __kmpc_xteamr_i_32x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_i, + __kmpc_rfun_sum_lds_i, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_32x32(unsigned int val, unsigned int *rval, + unsigned int *xteam_mem, unsigned int *td_ptr, + const unsigned int initval) { + __kmpc_xteamr_ui_32x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_ui, + __kmpc_rfun_sum_lds_ui, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_32x32(long val, long *rval, long *xteam_mem, + unsigned int *td_ptr, const long initval) { + __kmpc_xteamr_l_32x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_l, + __kmpc_rfun_sum_lds_l, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_32x32(unsigned long val, unsigned long *rval, + unsigned long *xteam_mem, unsigned int *td_ptr, + const unsigned long initval) { + __kmpc_xteamr_ul_32x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_ul, + __kmpc_rfun_sum_lds_ul, initval); +} + +// ---- Local overloads for testing max +void __attribute__((flatten, always_inline)) +max_overloaded_32x32(double val, double *rval, double *xteam_mem, + unsigned int *td_ptr, const double initval) { + __kmpc_xteamr_d_32x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_d, + __kmpc_rfun_max_lds_d, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_32x32(float val, float *rval, float *xteam_mem, + unsigned int *td_ptr, const float initval) { + __kmpc_xteamr_f_32x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_f, + __kmpc_rfun_max_lds_f, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_32x32(int val, int *rval, int *xteam_mem, unsigned int *td_ptr, + const int initval) { + __kmpc_xteamr_i_32x32(val, rval, xteam_mem, td_ptr, 
__kmpc_rfun_max_i, + __kmpc_rfun_max_lds_i, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_32x32(unsigned int val, unsigned int *rval, + unsigned int *xteam_mem, unsigned int *td_ptr, + const unsigned int initval) { + __kmpc_xteamr_ui_32x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_ui, + __kmpc_rfun_max_lds_ui, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_32x32(long val, long *rval, long *xteam_mem, + unsigned int *td_ptr, const long initval) { + __kmpc_xteamr_l_32x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_l, + __kmpc_rfun_max_lds_l, initval); +} + +void __attribute__((flatten, always_inline)) +max_overloaded_32x32(unsigned long val, unsigned long *rval, + unsigned long *xteam_mem, unsigned int *td_ptr, + const unsigned long initval) { + __kmpc_xteamr_ul_32x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_ul, + __kmpc_rfun_max_lds_ul, initval); +} + +// ---- Local overloads for testing min +void __attribute__((flatten, always_inline)) +min_overloaded_32x32(double val, double *rval, double *xteam_mem, + unsigned int *td_ptr, const double initval) { + __kmpc_xteamr_d_32x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_d, + __kmpc_rfun_min_lds_d, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_32x32(float val, float *rval, float *xteam_mem, + unsigned int *td_ptr, const float initval) { + __kmpc_xteamr_f_32x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_f, + __kmpc_rfun_min_lds_f, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_32x32(int val, int *rval, int *xteam_mem, unsigned int *td_ptr, + const int initval) { + __kmpc_xteamr_i_32x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_i, + __kmpc_rfun_min_lds_i, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_32x32(unsigned int val, unsigned int *rval, + unsigned int *xteam_mem, unsigned int *td_ptr, + const unsigned int initval) { + __kmpc_xteamr_ui_32x32(val, rval, 
xteam_mem, td_ptr, __kmpc_rfun_min_ui, + __kmpc_rfun_min_lds_ui, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_32x32(long val, long *rval, long *xteam_mem, + unsigned int *td_ptr, const long initval) { + __kmpc_xteamr_l_32x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_l, + __kmpc_rfun_min_lds_l, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_32x32(unsigned long val, unsigned long *rval, + unsigned long *xteam_mem, unsigned int *td_ptr, + const unsigned long initval) { + __kmpc_xteamr_ul_32x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_ul, + __kmpc_rfun_min_lds_ul, initval); +} diff --git a/openmp/libomptarget/test/xteamr/overload_insts_256.h b/openmp/libomptarget/test/xteamr/overload_insts_256.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/test/xteamr/overload_insts_256.h @@ -0,0 +1,279 @@ +//=== overload_insts_256.h overloaded instatiations of Xteamr fcts -C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This file contains all instantiations of functions using 256 threads +//===----------------------------------------------------------------------===// + +// ---- Local overloads for testing sum +void __attribute__((flatten, always_inline)) +sum_overloaded_4x64(double val, double *rval, double *xteam_mem, + unsigned int *td_ptr, const double initval) { + __kmpc_xteamr_d_4x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_d, + __kmpc_rfun_sum_lds_d, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_4x64(float val, float *rval, float *xteam_mem, + unsigned int *td_ptr, const float initval) { + __kmpc_xteamr_f_4x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_f, + __kmpc_rfun_sum_lds_f, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_4x64(double _Complex val, double _Complex *rval, + double _Complex *xteam_mem, unsigned int *td_ptr, + const double _Complex initval) { + __kmpc_xteamr_cd_4x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_cd, + __kmpc_rfun_sum_lds_cd, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_4x64(float _Complex val, float _Complex *rval, + float _Complex *xteam_mem, unsigned int *td_ptr, + const float _Complex initval) { + __kmpc_xteamr_cf_4x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_cf, + __kmpc_rfun_sum_lds_cf, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_4x64(int val, int *rval, int *xteam_mem, unsigned int *td_ptr, + const int initval) { + __kmpc_xteamr_i_4x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_i, + __kmpc_rfun_sum_lds_i, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_4x64(unsigned int val, unsigned int *rval, + unsigned int *xteam_mem, unsigned int *td_ptr, + const unsigned int initval) { + __kmpc_xteamr_ui_4x64(val, rval, xteam_mem, td_ptr, 
__kmpc_rfun_sum_ui, + __kmpc_rfun_sum_lds_ui, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_4x64(long val, long *rval, long *xteam_mem, unsigned int *td_ptr, + const long initval) { + __kmpc_xteamr_l_4x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_l, + __kmpc_rfun_sum_lds_l, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_4x64(unsigned long val, unsigned long *rval, + unsigned long *xteam_mem, unsigned int *td_ptr, + const unsigned long initval) { + __kmpc_xteamr_ul_4x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_ul, + __kmpc_rfun_sum_lds_ul, initval); +} + +// ---- Local overloads for testing max +void __attribute__((flatten, always_inline)) +max_overloaded_4x64(double val, double *rval, double *xteam_mem, + unsigned int *td_ptr, const double initval) { + __kmpc_xteamr_d_4x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_d, + __kmpc_rfun_max_lds_d, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_4x64(float val, float *rval, float *xteam_mem, + unsigned int *td_ptr, const float initval) { + __kmpc_xteamr_f_4x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_f, + __kmpc_rfun_max_lds_f, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_4x64(int val, int *rval, int *xteam_mem, unsigned int *td_ptr, + const int initval) { + __kmpc_xteamr_i_4x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_i, + __kmpc_rfun_max_lds_i, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_4x64(unsigned int val, unsigned int *rval, + unsigned int *xteam_mem, unsigned int *td_ptr, + const unsigned int initval) { + __kmpc_xteamr_ui_4x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_ui, + __kmpc_rfun_max_lds_ui, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_4x64(long val, long *rval, long *xteam_mem, unsigned int *td_ptr, + const long initval) { + __kmpc_xteamr_l_4x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_l, + 
__kmpc_rfun_max_lds_l, initval); +} + +void __attribute__((flatten, always_inline)) +max_overloaded_4x64(unsigned long val, unsigned long *rval, + unsigned long *xteam_mem, unsigned int *td_ptr, + const unsigned long initval) { + __kmpc_xteamr_ul_4x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_ul, + __kmpc_rfun_max_lds_ul, initval); +} + +// ---- Local overloads for testing min +void __attribute__((flatten, always_inline)) +min_overloaded_4x64(double val, double *rval, double *xteam_mem, + unsigned int *td_ptr, const double initval) { + __kmpc_xteamr_d_4x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_d, + __kmpc_rfun_min_lds_d, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_4x64(float val, float *rval, float *xteam_mem, + unsigned int *td_ptr, const float initval) { + __kmpc_xteamr_f_4x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_f, + __kmpc_rfun_min_lds_f, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_4x64(int val, int *rval, int *xteam_mem, unsigned int *td_ptr, + const int initval) { + __kmpc_xteamr_i_4x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_i, + __kmpc_rfun_min_lds_i, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_4x64(unsigned int val, unsigned int *rval, + unsigned int *xteam_mem, unsigned int *td_ptr, + const unsigned int initval) { + __kmpc_xteamr_ui_4x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_ui, + __kmpc_rfun_min_lds_ui, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_4x64(long val, long *rval, long *xteam_mem, unsigned int *td_ptr, + const long initval) { + __kmpc_xteamr_l_4x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_l, + __kmpc_rfun_min_lds_l, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_4x64(unsigned long val, unsigned long *rval, + unsigned long *xteam_mem, unsigned int *td_ptr, + const unsigned long initval) { + __kmpc_xteamr_ul_4x64(val, rval, xteam_mem, td_ptr, 
__kmpc_rfun_min_ul, + __kmpc_rfun_min_lds_ul, initval); +} + +// ---- Local overloads for testing sum with 32 item warps +void __attribute__((flatten, always_inline)) +sum_overloaded_8x32(double val, double *rval, double *xteam_mem, + unsigned int *td_ptr, const double initval) { + __kmpc_xteamr_d_8x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_d, + __kmpc_rfun_sum_lds_d, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_8x32(float val, float *rval, float *xteam_mem, + unsigned int *td_ptr, const float initval) { + __kmpc_xteamr_f_8x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_f, + __kmpc_rfun_sum_lds_f, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_8x32(double _Complex val, double _Complex *rval, + double _Complex *xteam_mem, unsigned int *td_ptr, + const double _Complex initval) { + __kmpc_xteamr_cd_8x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_cd, + __kmpc_rfun_sum_lds_cd, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_8x32(float _Complex val, float _Complex *rval, + float _Complex *xteam_mem, unsigned int *td_ptr, + const float _Complex initval) { + __kmpc_xteamr_cf_8x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_cf, + __kmpc_rfun_sum_lds_cf, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_8x32(int val, int *rval, int *xteam_mem, unsigned int *td_ptr, + const int initval) { + __kmpc_xteamr_i_8x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_i, + __kmpc_rfun_sum_lds_i, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_8x32(unsigned int val, unsigned int *rval, + unsigned int *xteam_mem, unsigned int *td_ptr, + const unsigned int initval) { + __kmpc_xteamr_ui_8x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_ui, + __kmpc_rfun_sum_lds_ui, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_8x32(long val, long *rval, long *xteam_mem, unsigned int *td_ptr, + const long initval) { + 
__kmpc_xteamr_l_8x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_l, + __kmpc_rfun_sum_lds_l, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_8x32(unsigned long val, unsigned long *rval, + unsigned long *xteam_mem, unsigned int *td_ptr, + const unsigned long initval) { + __kmpc_xteamr_ul_8x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_ul, + __kmpc_rfun_sum_lds_ul, initval); +} + +// ---- Local overloads for testing max +void __attribute__((flatten, always_inline)) +max_overloaded_8x32(double val, double *rval, double *xteam_mem, + unsigned int *td_ptr, const double initval) { + __kmpc_xteamr_d_8x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_d, + __kmpc_rfun_max_lds_d, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_8x32(float val, float *rval, float *xteam_mem, + unsigned int *td_ptr, const float initval) { + __kmpc_xteamr_f_8x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_f, + __kmpc_rfun_max_lds_f, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_8x32(int val, int *rval, int *xteam_mem, unsigned int *td_ptr, + const int initval) { + __kmpc_xteamr_i_8x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_i, + __kmpc_rfun_max_lds_i, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_8x32(unsigned int val, unsigned int *rval, + unsigned int *xteam_mem, unsigned int *td_ptr, + const unsigned int initval) { + __kmpc_xteamr_ui_8x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_ui, + __kmpc_rfun_max_lds_ui, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_8x32(long val, long *rval, long *xteam_mem, unsigned int *td_ptr, + const long initval) { + __kmpc_xteamr_l_8x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_l, + __kmpc_rfun_max_lds_l, initval); +} + +void __attribute__((flatten, always_inline)) +max_overloaded_8x32(unsigned long val, unsigned long *rval, + unsigned long *xteam_mem, unsigned int *td_ptr, + const unsigned long initval) 
{ + __kmpc_xteamr_ul_8x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_ul, + __kmpc_rfun_max_lds_ul, initval); +} + +// ---- Local overloads for testing min +void __attribute__((flatten, always_inline)) +min_overloaded_8x32(double val, double *rval, double *xteam_mem, + unsigned int *td_ptr, const double initval) { + __kmpc_xteamr_d_8x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_d, + __kmpc_rfun_min_lds_d, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_8x32(float val, float *rval, float *xteam_mem, + unsigned int *td_ptr, const float initval) { + __kmpc_xteamr_f_8x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_f, + __kmpc_rfun_min_lds_f, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_8x32(int val, int *rval, int *xteam_mem, unsigned int *td_ptr, + const int initval) { + __kmpc_xteamr_i_8x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_i, + __kmpc_rfun_min_lds_i, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_8x32(unsigned int val, unsigned int *rval, + unsigned int *xteam_mem, unsigned int *td_ptr, + const unsigned int initval) { + __kmpc_xteamr_ui_8x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_ui, + __kmpc_rfun_min_lds_ui, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_8x32(long val, long *rval, long *xteam_mem, unsigned int *td_ptr, + const long initval) { + __kmpc_xteamr_l_8x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_l, + __kmpc_rfun_min_lds_l, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_8x32(unsigned long val, unsigned long *rval, + unsigned long *xteam_mem, unsigned int *td_ptr, + const unsigned long initval) { + __kmpc_xteamr_ul_8x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_ul, + __kmpc_rfun_min_lds_ul, initval); +} diff --git a/openmp/libomptarget/test/xteamr/overload_insts_512.h b/openmp/libomptarget/test/xteamr/overload_insts_512.h new file mode 100644 --- /dev/null +++ 
b/openmp/libomptarget/test/xteamr/overload_insts_512.h @@ -0,0 +1,278 @@ +//=== overload_insts_512.h overloaded instantiations of Xteamr fcts -C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This file contains all instantiations of functions using 512 threads +//===----------------------------------------------------------------------===// +// ---- Local overloads for testing sum +void __attribute__((flatten, always_inline)) +sum_overloaded_8x64(double val, double *rval, double *xteam_mem, + unsigned int *td_ptr, const double initval) { + __kmpc_xteamr_d_8x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_d, + __kmpc_rfun_sum_lds_d, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_8x64(float val, float *rval, float *xteam_mem, + unsigned int *td_ptr, const float initval) { + __kmpc_xteamr_f_8x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_f, + __kmpc_rfun_sum_lds_f, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_8x64(double _Complex val, double _Complex *rval, + double _Complex *xteam_mem, unsigned int *td_ptr, + const double _Complex initval) { + __kmpc_xteamr_cd_8x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_cd, + __kmpc_rfun_sum_lds_cd, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_8x64(float _Complex val, float _Complex *rval, + float _Complex *xteam_mem, unsigned int *td_ptr, + const float _Complex initval) { + __kmpc_xteamr_cf_8x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_cf, + __kmpc_rfun_sum_lds_cf, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_8x64(int val, int *rval, int *xteam_mem, unsigned int *td_ptr, + const int initval) { + __kmpc_xteamr_i_8x64(val, rval, xteam_mem, td_ptr, 
__kmpc_rfun_sum_i, + __kmpc_rfun_sum_lds_i, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_8x64(unsigned int val, unsigned int *rval, + unsigned int *xteam_mem, unsigned int *td_ptr, + const unsigned int initval) { + __kmpc_xteamr_ui_8x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_ui, + __kmpc_rfun_sum_lds_ui, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_8x64(long val, long *rval, long *xteam_mem, unsigned int *td_ptr, + const long initval) { + __kmpc_xteamr_l_8x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_l, + __kmpc_rfun_sum_lds_l, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_8x64(unsigned long val, unsigned long *rval, + unsigned long *xteam_mem, unsigned int *td_ptr, + const unsigned long initval) { + __kmpc_xteamr_ul_8x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_ul, + __kmpc_rfun_sum_lds_ul, initval); +} + +// ---- Local overloads for testing max +void __attribute__((flatten, always_inline)) +max_overloaded_8x64(double val, double *rval, double *xteam_mem, + unsigned int *td_ptr, const double initval) { + __kmpc_xteamr_d_8x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_d, + __kmpc_rfun_max_lds_d, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_8x64(float val, float *rval, float *xteam_mem, + unsigned int *td_ptr, const float initval) { + __kmpc_xteamr_f_8x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_f, + __kmpc_rfun_max_lds_f, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_8x64(int val, int *rval, int *xteam_mem, unsigned int *td_ptr, + const int initval) { + __kmpc_xteamr_i_8x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_i, + __kmpc_rfun_max_lds_i, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_8x64(unsigned int val, unsigned int *rval, + unsigned int *xteam_mem, unsigned int *td_ptr, + const unsigned int initval) { + __kmpc_xteamr_ui_8x64(val, rval, xteam_mem, td_ptr, 
__kmpc_rfun_max_ui, + __kmpc_rfun_max_lds_ui, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_8x64(long val, long *rval, long *xteam_mem, unsigned int *td_ptr, + const long initval) { + __kmpc_xteamr_l_8x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_l, + __kmpc_rfun_max_lds_l, initval); +} + +void __attribute__((flatten, always_inline)) +max_overloaded_8x64(unsigned long val, unsigned long *rval, + unsigned long *xteam_mem, unsigned int *td_ptr, + const unsigned long initval) { + __kmpc_xteamr_ul_8x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_ul, + __kmpc_rfun_max_lds_ul, initval); +} + +// ---- Local overloads for testing min +void __attribute__((flatten, always_inline)) +min_overloaded_8x64(double val, double *rval, double *xteam_mem, + unsigned int *td_ptr, const double initval) { + __kmpc_xteamr_d_8x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_d, + __kmpc_rfun_min_lds_d, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_8x64(float val, float *rval, float *xteam_mem, + unsigned int *td_ptr, const float initval) { + __kmpc_xteamr_f_8x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_f, + __kmpc_rfun_min_lds_f, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_8x64(int val, int *rval, int *xteam_mem, unsigned int *td_ptr, + const int initval) { + __kmpc_xteamr_i_8x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_i, + __kmpc_rfun_min_lds_i, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_8x64(unsigned int val, unsigned int *rval, + unsigned int *xteam_mem, unsigned int *td_ptr, + const unsigned int initval) { + __kmpc_xteamr_ui_8x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_ui, + __kmpc_rfun_min_lds_ui, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_8x64(long val, long *rval, long *xteam_mem, unsigned int *td_ptr, + const long initval) { + __kmpc_xteamr_l_8x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_l, + 
__kmpc_rfun_min_lds_l, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_8x64(unsigned long val, unsigned long *rval, + unsigned long *xteam_mem, unsigned int *td_ptr, + const unsigned long initval) { + __kmpc_xteamr_ul_8x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_ul, + __kmpc_rfun_min_lds_ul, initval); +} + +// ---- Local overloads for testing sum with 32 item warps +void __attribute__((flatten, always_inline)) +sum_overloaded_16x32(double val, double *rval, double *xteam_mem, + unsigned int *td_ptr, const double initval) { + __kmpc_xteamr_d_16x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_d, + __kmpc_rfun_sum_lds_d, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_16x32(float val, float *rval, float *xteam_mem, + unsigned int *td_ptr, const float initval) { + __kmpc_xteamr_f_16x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_f, + __kmpc_rfun_sum_lds_f, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_16x32(double _Complex val, double _Complex *rval, + double _Complex *xteam_mem, unsigned int *td_ptr, + const double _Complex initval) { + __kmpc_xteamr_cd_16x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_cd, + __kmpc_rfun_sum_lds_cd, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_16x32(float _Complex val, float _Complex *rval, + float _Complex *xteam_mem, unsigned int *td_ptr, + const float _Complex initval) { + __kmpc_xteamr_cf_16x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_cf, + __kmpc_rfun_sum_lds_cf, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_16x32(int val, int *rval, int *xteam_mem, unsigned int *td_ptr, + const int initval) { + __kmpc_xteamr_i_16x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_i, + __kmpc_rfun_sum_lds_i, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_16x32(unsigned int val, unsigned int *rval, + unsigned int *xteam_mem, unsigned int *td_ptr, + const unsigned int 
initval) { + __kmpc_xteamr_ui_16x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_ui, + __kmpc_rfun_sum_lds_ui, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_16x32(long val, long *rval, long *xteam_mem, + unsigned int *td_ptr, const long initval) { + __kmpc_xteamr_l_16x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_l, + __kmpc_rfun_sum_lds_l, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_16x32(unsigned long val, unsigned long *rval, + unsigned long *xteam_mem, unsigned int *td_ptr, + const unsigned long initval) { + __kmpc_xteamr_ul_16x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_ul, + __kmpc_rfun_sum_lds_ul, initval); +} + +// ---- Local overloads for testing max +void __attribute__((flatten, always_inline)) +max_overloaded_16x32(double val, double *rval, double *xteam_mem, + unsigned int *td_ptr, const double initval) { + __kmpc_xteamr_d_16x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_d, + __kmpc_rfun_max_lds_d, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_16x32(float val, float *rval, float *xteam_mem, + unsigned int *td_ptr, const float initval) { + __kmpc_xteamr_f_16x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_f, + __kmpc_rfun_max_lds_f, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_16x32(int val, int *rval, int *xteam_mem, unsigned int *td_ptr, + const int initval) { + __kmpc_xteamr_i_16x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_i, + __kmpc_rfun_max_lds_i, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_16x32(unsigned int val, unsigned int *rval, + unsigned int *xteam_mem, unsigned int *td_ptr, + const unsigned int initval) { + __kmpc_xteamr_ui_16x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_ui, + __kmpc_rfun_max_lds_ui, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_16x32(long val, long *rval, long *xteam_mem, + unsigned int *td_ptr, const long initval) { + 
__kmpc_xteamr_l_16x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_l, + __kmpc_rfun_max_lds_l, initval); +} + +void __attribute__((flatten, always_inline)) +max_overloaded_16x32(unsigned long val, unsigned long *rval, + unsigned long *xteam_mem, unsigned int *td_ptr, + const unsigned long initval) { + __kmpc_xteamr_ul_16x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_ul, + __kmpc_rfun_max_lds_ul, initval); +} + +// ---- Local overloads for testing min +void __attribute__((flatten, always_inline)) +min_overloaded_16x32(double val, double *rval, double *xteam_mem, + unsigned int *td_ptr, const double initval) { + __kmpc_xteamr_d_16x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_d, + __kmpc_rfun_min_lds_d, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_16x32(float val, float *rval, float *xteam_mem, + unsigned int *td_ptr, const float initval) { + __kmpc_xteamr_f_16x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_f, + __kmpc_rfun_min_lds_f, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_16x32(int val, int *rval, int *xteam_mem, unsigned int *td_ptr, + const int initval) { + __kmpc_xteamr_i_16x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_i, + __kmpc_rfun_min_lds_i, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_16x32(unsigned int val, unsigned int *rval, + unsigned int *xteam_mem, unsigned int *td_ptr, + const unsigned int initval) { + __kmpc_xteamr_ui_16x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_ui, + __kmpc_rfun_min_lds_ui, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_16x32(long val, long *rval, long *xteam_mem, + unsigned int *td_ptr, const long initval) { + __kmpc_xteamr_l_16x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_l, + __kmpc_rfun_min_lds_l, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_16x32(unsigned long val, unsigned long *rval, + unsigned long *xteam_mem, unsigned int *td_ptr, + const unsigned 
long initval) { + __kmpc_xteamr_ul_16x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_ul, + __kmpc_rfun_min_lds_ul, initval); +} diff --git a/openmp/libomptarget/test/xteamr/test_xteamr.sh b/openmp/libomptarget/test/xteamr/test_xteamr.sh new file mode 100755 --- /dev/null +++ b/openmp/libomptarget/test/xteamr/test_xteamr.sh @@ -0,0 +1,63 @@ +#!/bin/bash +#== test_xteamr.sh test script for Xteamr reduction helper fcts -----------===// +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +#===----------------------------------------------------------------------===// +# +# test_xteamr.sh: Script to test high performance reduction helper functions +# in llvm-project/openmp/libomptarget/DeviceRTL/src/Xteamr.cpp +# It compiles and executes xteamr.cpp in 3 different configs. +# +# See README file in this directory for more information. +# +#===----------------------------------------------------------------------===// + +LLVM_INSTALL=${LLVM_INSTALL:-$HOME/llvm} +[ ! -f $LLVM_INSTALL/bin/clang ] && echo "ERROR: no LLVM install at $LLVM_INSTALL" && exit 1 + +OFFLOAD_ARCH=${OFFLOAD_ARCH:-sm_70} + +tmpdir=/tmp/$USER/xteamr && mkdir -p $tmpdir +[ ! -d $tmpdir ] && echo "ERROR: could not create $tmpdir" + +ARRAY_SIZE=${ARRAY_SIZE:-41943040} +as_arg="-D_ARRAY_SIZE=$ARRAY_SIZE" + +cuda_args="" +CUDA_INSTALL=${CUDA_INSTALL:-/usr/local/cuda} +[ -d $CUDA_INSTALL ] && cudalib=$CUDA_INSTALL/targets/x86_64-linux/lib && export LD_LIBRARY_PATH=$cudalib && cuda_args="-L$cudalib -lcudart" + +nt_arg="-D_XTEAM_NUM_THREADS=1024" +echo compile with $as_arg and $nt_arg +$LLVM_INSTALL/bin/clang++ -O3 -I. $as_arg $nt_arg -fopenmp --offload-arch=$OFFLOAD_ARCH xteamr.cpp -o $tmpdir/xteamr_1024 $cuda_args -lstdc++ -latomic +rc1=$? + +nt_arg="-D_XTEAM_NUM_THREADS=512" +echo compile with $as_arg and $nt_arg +$LLVM_INSTALL/bin/clang++ -O3 -I. 
$as_arg $nt_arg -fopenmp --offload-arch=$OFFLOAD_ARCH xteamr.cpp -o $tmpdir/xteamr_512 $cuda_args -lstdc++ -latomic +rc2=$? + +nt_arg="-D_XTEAM_NUM_THREADS=256" +echo compile with $as_arg and $nt_arg +$LLVM_INSTALL/bin/clang++ -O3 -I. $as_arg $nt_arg -fopenmp --offload-arch=$OFFLOAD_ARCH xteamr.cpp -o $tmpdir/xteamr_256 $cuda_args -lstdc++ -latomic +rc3=$? + +[ $rc1 == 0 ] && echo "START EXECUTE xteamr_1024" && $tmpdir/xteamr_1024 > $tmpdir/xteamr_1024.out +rc4=$? +[ $rc2 == 0 ] && echo "START EXECUTE xteamr_512" && $tmpdir/xteamr_512 > $tmpdir/xteamr_512.out +rc5=$? +[ $rc3 == 0 ] && echo "START EXECUTE xteamr_256" && $tmpdir/xteamr_256 > $tmpdir/xteamr_256.out +rc6=$? + +echo +rc=$(( $rc1 + $rc2 + $rc3 + $rc4 + $rc5 + $rc6 )) +if [ $rc != 0 ] ; then + echo "ERRORS DETECTED!" +else + echo "No errors detected" +fi +echo "Logs and binaries saved to $tmpdir" +exit $rc diff --git a/openmp/libomptarget/test/xteamr/xteamr.cpp b/openmp/libomptarget/test/xteamr/xteamr.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/test/xteamr/xteamr.cpp @@ -0,0 +1,680 @@ +//===--------- xteamr.cpp - Test for Xteamr DeviceRTL functions ----C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// performance and functional tests for Xteamr reduction helper functions in +// libomptarget/DeviceRTL/Xteamr.cpp +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Extern Xteamr functions are designed for 1024, 512, and 256 thread blocks. +// To change _XTEAM_NUM_THREADS to 512, change "16x64" names to "8x64" names, +// and change "32x32" names to "16x32" names. 
+// For 256 threads, change "16x64" names to "4x64", and "32x32" to "8x32". + +#ifndef _XTEAM_NUM_THREADS +#define _XTEAM_NUM_THREADS 1024 +#endif + +#if _XTEAM_NUM_THREADS == 1024 +#define _SUM_OVERLOAD_64_FCT sum_overloaded_16x64 +#define _SUM_OVERLOAD_32_FCT sum_overloaded_32x32 +#define _MAX_OVERLOAD_64_FCT max_overloaded_16x64 +#define _MAX_OVERLOAD_32_FCT max_overloaded_32x32 +#define _MIN_OVERLOAD_64_FCT min_overloaded_16x64 +#define _MIN_OVERLOAD_32_FCT min_overloaded_32x32 +#define _OVERLOADED_INSTANTIATIONS "overload_insts_1024.h" +#elif _XTEAM_NUM_THREADS == 512 +#define _SUM_OVERLOAD_64_FCT sum_overloaded_8x64 +#define _SUM_OVERLOAD_32_FCT sum_overloaded_16x32 +#define _MAX_OVERLOAD_64_FCT max_overloaded_8x64 +#define _MAX_OVERLOAD_32_FCT max_overloaded_16x32 +#define _MIN_OVERLOAD_64_FCT min_overloaded_8x64 +#define _MIN_OVERLOAD_32_FCT min_overloaded_16x32 +#define _OVERLOADED_INSTANTIATIONS "overload_insts_512.h" +#elif _XTEAM_NUM_THREADS == 256 +#define _SUM_OVERLOAD_64_FCT sum_overloaded_4x64 +#define _SUM_OVERLOAD_32_FCT sum_overloaded_8x32 +#define _MAX_OVERLOAD_64_FCT max_overloaded_4x64 +#define _MAX_OVERLOAD_32_FCT max_overloaded_8x32 +#define _MIN_OVERLOAD_64_FCT min_overloaded_4x64 +#define _MIN_OVERLOAD_32_FCT min_overloaded_8x32 +#define _OVERLOADED_INSTANTIATIONS "overload_insts_256.h" +#else +#error Invalid value for _XTEAM_NUM_THREADS. 
Must be 1024, 512, or 256 +#endif + +#ifndef _ARRAY_SIZE +#define _ARRAY_SIZE 335544320 +#endif +const uint64_t ARRAY_SIZE = _ARRAY_SIZE; +unsigned int repeat_num_times = 12; +unsigned int ignore_times = 2; // ignore this many timings first + +unsigned int test_run_rc = 0; + +template void run_tests(const uint64_t); +template void run_tests_complex(const uint64_t); + +int main(int argc, char *argv[]) { + std::cout << std::endl + << "TEST DOUBLE " << _XTEAM_NUM_THREADS << " THREADS" << std::endl; + run_tests(ARRAY_SIZE); + std::cout << std::endl + << "TEST FLOAT " << _XTEAM_NUM_THREADS << " THREADS" << std::endl; + run_tests(ARRAY_SIZE); + std::cout << std::endl + << "TEST INT " << _XTEAM_NUM_THREADS << " THREADS" << std::endl; + run_tests(ARRAY_SIZE); + std::cout << std::endl + << "TEST UNSIGNED INT " << _XTEAM_NUM_THREADS << " THREADS" + << std::endl; + run_tests(ARRAY_SIZE); + std::cout << std::endl + << "TEST LONG " << _XTEAM_NUM_THREADS << " THREADS " << std::endl; + run_tests(ARRAY_SIZE); + std::cout << std::endl + << "TEST UNSIGNED LONG " << _XTEAM_NUM_THREADS << " THREADS" + << std::endl; + run_tests(ARRAY_SIZE); +#if 0 + // FIXME: _Complex is hanging in last team logic of _xteam_reduction + // Hint, sizeof(double _Complex) is 16 bytes + std::cout << std::endl << "TEST FLOAT COMPLEX " + << _XTEAM_NUM_THREADS << " THREADS" << std::endl; + run_tests_complex(ARRAY_SIZE); + std::cout << std::endl << "TEST DOUBLE COMPLEX " + << _XTEAM_NUM_THREADS << " THREADS" << std::endl; + run_tests_complex(ARRAY_SIZE); +#endif + return test_run_rc; +} + +#include "xteamr_host_dev.h" +#include _OVERLOADED_INSTANTIATIONS + +template T omp_dot(T *a, T *b, uint64_t array_size) { + T sum = 0.0; +#pragma omp target teams distribute parallel for map(tofrom: sum) reduction(+:sum) + for (int64_t i = 0; i < array_size; i++) + sum += a[i] * b[i]; + return sum; +} + +template T omp_max(T *c, uint64_t array_size) { + T maxval = std::numeric_limits::lowest(); +#pragma omp target teams 
distribute parallel for map(tofrom \ + : maxval) \ + reduction(max \ + : maxval) + for (int64_t i = 0; i < array_size; i++) + maxval = (c[i] > maxval) ? c[i] : maxval; + return maxval; +} + +template T omp_min(T *c, uint64_t array_size) { + T minval = std::numeric_limits::max(); +#pragma omp target teams distribute parallel for map(tofrom \ + : minval) \ + reduction(min \ + : minval) + for (int64_t i = 0; i < array_size; i++) { + minval = (c[i] < minval) ? c[i] : minval; + } + return minval; +} + +#define _INNER_LOOP \ + for (int64_t i = ((k * LOOP_STRIDE) + LOOP_START); i < LOOP_SIZE; \ + i += (LOOP_TEAMS * _XTEAM_NUM_THREADS * LOOP_STRIDE)) + +template +T sim_dot(T *a, T *b, uint64_t array_size, int warp_size) { + T sum = T(0); + int devid = 0; + static uint32_t *teams_done_ptr0 = nullptr; + static uint32_t *d_teams_done_ptr0; + static T *d_team_vals0; + static uint32_t team_procs0; + if (!teams_done_ptr0) { + // One-time alloc device array for each teams's reduction value. + team_procs0 = ompx_get_team_procs(devid); + d_team_vals0 = (T *)omp_target_alloc(sizeof(T) * team_procs0, devid); + // Allocate and copy the zero-initialized teams_done counter one time + // because it atomically resets when last team increments it. 
+ teams_done_ptr0 = (uint32_t *)malloc(sizeof(uint32_t)); + *teams_done_ptr0 = 0; + d_teams_done_ptr0 = (uint32_t *)omp_target_alloc(sizeof(uint32_t), devid); + omp_target_memcpy(d_teams_done_ptr0, teams_done_ptr0, sizeof(uint32_t), 0, + 0, devid, omp_get_initial_device()); + } + // Making the array_size 64 bits avoids a data_submit and data_retrieve + const uint64_t LOOP_TEAMS = team_procs0; + const uint64_t LOOP_SIZE = (uint64_t)array_size; + if (warp_size == 64) { +#pragma omp target teams distribute parallel for num_teams(LOOP_TEAMS) \ + num_threads(_XTEAM_NUM_THREADS) map(tofrom \ + : sum) \ + is_device_ptr(d_team_vals0, d_teams_done_ptr0) + for (unsigned int k = 0; k < (LOOP_TEAMS * _XTEAM_NUM_THREADS); k++) { + T val0 = T(0); + constexpr int LOOP_START = 0; + constexpr uint32_t LOOP_STRIDE = 1; + _INNER_LOOP { val0 += a[i] * b[i]; } + _SUM_OVERLOAD_64_FCT(val0, &sum, d_team_vals0, d_teams_done_ptr0, T(0)); + } + } else { +#pragma omp target teams distribute parallel for num_teams(LOOP_TEAMS) \ + num_threads(_XTEAM_NUM_THREADS) map(tofrom \ + : sum) \ + is_device_ptr(d_team_vals0, d_teams_done_ptr0) + for (unsigned int k = 0; k < (LOOP_TEAMS * _XTEAM_NUM_THREADS); k++) { + T val0 = T(0); + constexpr int LOOP_START = 0; + constexpr uint32_t LOOP_STRIDE = 1; + _INNER_LOOP { val0 += a[i] * b[i]; } + _SUM_OVERLOAD_32_FCT(val0, &sum, d_team_vals0, d_teams_done_ptr0, T(0)); + } + } + return sum; +} + +template T sim_max(T *c, uint64_t array_size, int warp_size) { + int devid = 0; + const T minval = std::numeric_limits::lowest(); + T retval = minval; + static uint32_t *teams_done_ptr1 = nullptr; + static uint32_t *d_teams_done_ptr1; + static T *d_team_vals1; + static uint32_t team_procs1; + if (!teams_done_ptr1) { + // One-time alloc device array for each teams's reduction value. 
+ team_procs1 = ompx_get_team_procs(devid); + d_team_vals1 = (T *)omp_target_alloc(sizeof(T) * team_procs1, devid); + // Allocate and copy the zero-initialized teams_done counter one time + // because it atomically resets when last team increments it. + // Clang can create a global initialized to 0 and remove the alloc and copy + teams_done_ptr1 = (uint32_t *)malloc(sizeof(uint32_t)); + *teams_done_ptr1 = 0; + d_teams_done_ptr1 = (uint32_t *)omp_target_alloc(sizeof(uint32_t), devid); + omp_target_memcpy(d_teams_done_ptr1, teams_done_ptr1, sizeof(uint32_t), 0, + 0, devid, omp_get_initial_device()); + } + // Making the array_size 64 bits somehow avoids a data_submit and + // data_retrieve.? + const uint64_t LOOP_TEAMS = team_procs1; + const uint64_t LOOP_SIZE = (uint64_t)array_size; + const T LOOP_INITVAL = minval; + if (warp_size == 64) { +#pragma omp target teams distribute parallel for num_teams(LOOP_TEAMS) \ + num_threads(_XTEAM_NUM_THREADS) map(tofrom \ + : retval) \ + is_device_ptr(d_team_vals1, d_teams_done_ptr1) + for (unsigned int k = 0; k < (LOOP_TEAMS * _XTEAM_NUM_THREADS); k++) { + T val1 = LOOP_INITVAL; + const int LOOP_START = 0; + const uint32_t LOOP_STRIDE = 1; + _INNER_LOOP { val1 = (c[i] > val1) ? c[i] : val1; } + // we do not have device numeric_limits so use host numeric_limits + // clang codegen should embed a constant for each datatype to pass to + // xteamr + _MAX_OVERLOAD_64_FCT(val1, &retval, d_team_vals1, d_teams_done_ptr1, + LOOP_INITVAL); + } + } else { +#pragma omp target teams distribute parallel for num_teams(LOOP_TEAMS) \ + num_threads(_XTEAM_NUM_THREADS) map(tofrom \ + : retval) \ + is_device_ptr(d_team_vals1, d_teams_done_ptr1) + for (unsigned int k = 0; k < (LOOP_TEAMS * _XTEAM_NUM_THREADS); k++) { + T val1 = LOOP_INITVAL; + const int LOOP_START = 0; + const uint32_t LOOP_STRIDE = 1; + _INNER_LOOP { val1 = (c[i] > val1) ? 
c[i] : val1; } + _MAX_OVERLOAD_32_FCT(val1, &retval, d_team_vals1, d_teams_done_ptr1, + LOOP_INITVAL); + } + } + return retval; +} + +template T sim_min(T *c, uint64_t array_size, int warp_size) { + int devid = 0; + const T maxval = std::numeric_limits::max(); + T retval = maxval; + static uint32_t *teams_done_ptr2; + static uint32_t *d_teams_done_ptr2; + static T *d_team_vals2; + static uint32_t team_procs2; + if (!teams_done_ptr2) { + // One-time alloc device array for each teams's reduction value. + team_procs2 = ompx_get_team_procs(devid); + d_team_vals2 = (T *)omp_target_alloc(sizeof(T) * team_procs2, devid); + // Allocate and copy the zero-initialized teams_done counter one time + // because it atomically resets when last team increments it. + // Clang can create a global initialized to 0 and remove the alloc and copy + teams_done_ptr2 = (uint32_t *)malloc(sizeof(uint32_t)); + *teams_done_ptr2 = 0; + d_teams_done_ptr2 = (uint32_t *)omp_target_alloc(sizeof(uint32_t), devid); + omp_target_memcpy(d_teams_done_ptr2, teams_done_ptr2, sizeof(uint32_t), 0, + 0, devid, omp_get_initial_device()); + } + // Making the array_size 64 bits avoids a data_submit and data_retrieve. + const uint64_t LOOP_TEAMS = team_procs2; + const uint64_t LOOP_SIZE = (uint64_t)array_size; + const T LOOP_INITVAL = maxval; + if (warp_size == 64) { +#pragma omp target teams distribute parallel for num_teams(LOOP_TEAMS) \ + num_threads(_XTEAM_NUM_THREADS) map(tofrom \ + : retval) \ + is_device_ptr(d_team_vals2, d_teams_done_ptr2) + for (unsigned int k = 0; k < (LOOP_TEAMS * _XTEAM_NUM_THREADS); k++) { + T val2 = LOOP_INITVAL; + const int LOOP_START = 0; + const uint32_t LOOP_STRIDE = 1; + _INNER_LOOP { val2 = (c[i] < val2) ? 
c[i] : val2; } + _MIN_OVERLOAD_64_FCT(val2, &retval, d_team_vals2, d_teams_done_ptr2, + LOOP_INITVAL); + } + } else { +#pragma omp target teams distribute parallel for num_teams(LOOP_TEAMS) \ + num_threads(_XTEAM_NUM_THREADS) map(tofrom \ + : retval) \ + is_device_ptr(d_team_vals2, d_teams_done_ptr2) + for (unsigned int k = 0; k < (LOOP_TEAMS * _XTEAM_NUM_THREADS); k++) { + T val2 = LOOP_INITVAL; + const int LOOP_START = 0; + const uint32_t LOOP_STRIDE = 1; + _INNER_LOOP { val2 = (c[i] < val2) ? c[i] : val2; } + _MIN_OVERLOAD_32_FCT(val2, &retval, d_team_vals2, d_teams_done_ptr2, + LOOP_INITVAL); + } + } + return retval; +} + +template +void _check_val(T computed_val, T gold_val, const char *msg) { + double ETOL = 0.0000001; + if (DATA_TYPE_IS_INT) { + if (computed_val != gold_val) { + std::cerr << msg << " FAIL " + << "Integar Value was " << computed_val << " but should be " + << gold_val << std::endl; + test_run_rc = 1; + } + } else { + double dcomputed_val = (double)computed_val; + double dvalgold = (double)gold_val; + double ompErrSum = abs((dcomputed_val - dvalgold) / dvalgold); + if (ompErrSum > ETOL) { + std::cerr << msg << " FAIL " << ompErrSum << " tol:" << ETOL << std::endl + << std::setprecision(15) << "Value was " << computed_val + << " but should be " << gold_val << std::endl; + test_run_rc = 1; + } + } +} + +template +void run_tests(const uint64_t array_size) { + + // FIXME: How do we get warpsize of a device from host? 
+ int warp_size; +#pragma omp target map(tofrom : warp_size) + warp_size = __kmpc_get_warp_size(); + + // Align on 2M boundaries + T *a = (T *)aligned_alloc(2 * 1024 * 1024, sizeof(T) * array_size); + T *b = (T *)aligned_alloc(2 * 1024 * 1024, sizeof(T) * array_size); + T *c = (T *)aligned_alloc(2 * 1024 * 1024, sizeof(T) * array_size); +#pragma omp target enter data map(alloc \ + : a [0:array_size], b [0:array_size], \ + c [0:array_size]) +#pragma omp target teams distribute parallel for + for (int64_t i = 0; i < array_size; i++) { + a[i] = 2; + b[i] = 3; + c[i] = (i + 1); + } + + std::cout << "Running kernels " << repeat_num_times << " times" << std::endl; + std::cout << "Ignoring timing of first " << ignore_times << " runs " + << std::endl; + + double ETOL = 0.0000001; + if (DATA_TYPE_IS_INT) { + std::cout << "Integer Size: " << sizeof(T) << std::endl; + } else { + if (sizeof(T) == sizeof(float)) + std::cout << "Precision: float" << std::endl; + else + std::cout << "Precision: double" << std::endl; + } + + int team_procs = ompx_get_team_procs(omp_get_default_device()); + std::cout << "Warp size:" << warp_size << std::endl; + std::cout << "Team processors:" << team_procs << std::endl; + std::cout << "Array elements: " << array_size << std::endl; + std::cout << "Array size: " << ((array_size * sizeof(T)) / (1024 * 1024)) + << " MB" << std::endl; + + T goldDot = (T)6 * (T)array_size; + T goldMax = (T)array_size; + T goldMin = (T)1; + + double goldDot_d = (double)goldDot; + double goldMax_d = (double)goldMax; + double goldMin_d = (double)goldMin; + + // List of times + std::vector> timings(6); + + // Declare timers + std::chrono::high_resolution_clock::time_point t1, t2; + + // Timing loop + for (unsigned int k = 0; k < repeat_num_times; k++) { + t1 = std::chrono::high_resolution_clock::now(); + T omp_sum = omp_dot(a, b, array_size); + t2 = std::chrono::high_resolution_clock::now(); + timings[0].push_back( + std::chrono::duration_cast>(t2 - t1) + .count()); + 
_check_val(omp_sum, goldDot, "omp_dot"); + + t1 = std::chrono::high_resolution_clock::now(); + T sim_sum = sim_dot(a, b, array_size, warp_size); + t2 = std::chrono::high_resolution_clock::now(); + timings[1].push_back( + std::chrono::duration_cast>(t2 - t1) + .count()); + _check_val(sim_sum, goldDot, "sim_dot"); + + t1 = std::chrono::high_resolution_clock::now(); + T omp_max_val = omp_max(c, array_size); + t2 = std::chrono::high_resolution_clock::now(); + timings[2].push_back( + std::chrono::duration_cast>(t2 - t1) + .count()); + _check_val(omp_max_val, goldMax, "omp_max"); + + t1 = std::chrono::high_resolution_clock::now(); + T sim_max_val = sim_max(c, array_size, warp_size); + t2 = std::chrono::high_resolution_clock::now(); + timings[3].push_back( + std::chrono::duration_cast>(t2 - t1) + .count()); + _check_val(sim_max_val, goldMax, "sim_max"); + + t1 = std::chrono::high_resolution_clock::now(); + T omp_min_val = omp_min(c, array_size); + t2 = std::chrono::high_resolution_clock::now(); + timings[4].push_back( + std::chrono::duration_cast>(t2 - t1) + .count()); + _check_val(omp_min_val, goldMin, "omp_min"); + + t1 = std::chrono::high_resolution_clock::now(); + T sim_min_val = sim_min(c, array_size, warp_size); + t2 = std::chrono::high_resolution_clock::now(); + timings[5].push_back( + std::chrono::duration_cast>(t2 - t1) + .count()); + _check_val(sim_min_val, goldMin, "sim_min"); + + } // end Timing loop + + // Display timing results + std::cout << std::left << std::setw(12) << "Function" << std::left + << std::setw(12) << "Best-MB/sec" << std::left << std::setw(12) + << " Min (sec)" << std::left << std::setw(12) << " Max" + << std::left << std::setw(12) << "Average" << std::left + << std::setw(12) << "Avg-MB/sec" << std::endl; + + std::cout << std::fixed; + + std::string labels[6] = {"ompdot", "simdot", "ompmax", + "simmax", "ompmin", "simmin"}; + size_t sizes[6] = {2 * sizeof(T) * array_size, 2 * sizeof(T) * array_size, + 1 * sizeof(T) * array_size, 1 * 
sizeof(T) * array_size, + 1 * sizeof(T) * array_size, 1 * sizeof(T) * array_size}; + + for (int i = 0; i < 6; i++) { + // Get min/max; ignore the first couple results + auto minmax = std::minmax_element(timings[i].begin() + ignore_times, + timings[i].end()); + + // Calculate average; ignore ignore_times + double average = std::accumulate(timings[i].begin() + ignore_times, + timings[i].end(), 0.0) / + (double)(repeat_num_times - ignore_times); + + printf(" %s %8.0f %8.6f %8.6f %8.6f %8.0f\n", + labels[i].c_str(), 1.0E-6 * sizes[i] / (*minmax.first), + (double)*minmax.first, (double)*minmax.second, (double)average, + 1.0E-6 * sizes[i] / (average)); + } +#pragma omp target exit data map(release \ + : a [0:array_size], b [0:array_size], \ + c [0:array_size]) + free(a); + free(b); + free(c); +} + +/// ======================== START COMPLEX TESTING =========================== + +template +void _check_val_complex(TC computed_val_complex, TC gold_val_complex, + const char *msg) { + T gold_val_r = __real__(gold_val_complex); + T computed_val_r = __real__(computed_val_complex); + T gold_val_i = __imag__(gold_val_complex); + T computed_val_i = __imag__(computed_val_complex); + double ETOL = 0.0000001; + double computed_val_r_d = (double)computed_val_r; + double valgold_r_d = (double)gold_val_r; + double ompErrSum_r = abs((computed_val_r_d - valgold_r_d) / valgold_r_d); + double computed_val_i_d = (double)computed_val_i; + double valgold_i_d = (double)gold_val_i; + double ompErrSum_i = abs((computed_val_i_d - valgold_i_d) / valgold_i_d); + if ((ompErrSum_r > ETOL) || (ompErrSum_i > ETOL)) { + std::cerr << msg << " FAIL " << ompErrSum_r << " tol:" << ETOL << std::endl + << std::setprecision(15) << "Value was (" << computed_val_r + << " + " << computed_val_i << " i )" << std::endl + << " but should be (" << gold_val_r << " + " << gold_val_i + << "i) " << std::endl; + test_run_rc = 1; + } +} + +template TC omp_dot_complex(TC *a, TC *b, uint64_t array_size) { + TC dot; + 
__real__(dot) = 0.0; + __imag__(dot) = 0.0; +#pragma omp target teams distribute parallel for map(tofrom: dot) reduction(+:dot) + for (int64_t i = 0; i < array_size; i++) + dot += a[i] * b[i]; + return dot; +} + +template +TC sim_dot_complex(TC *a, TC *b, uint64_t array_size, int warp_size) { + TC zero_c; + __real__(zero_c) = 0.0; + __imag__(zero_c) = 0.0; + TC dot = zero_c; + int devid = 0; + static uint32_t *teams_done_ptr00 = nullptr; + static uint32_t *d_teams_done_ptr00; + static TC *d_team_vals00; + static uint32_t team_procs00; + if (!teams_done_ptr00) { + // One-time alloc device array for each teams's reduction value. + team_procs00 = ompx_get_team_procs(devid); + d_team_vals00 = (TC *)omp_target_alloc(sizeof(TC) * team_procs00, devid); + // Allocate and copy the zero-initialized teams_done counter one time + // because it atomically resets when last team increments it. + teams_done_ptr00 = (uint32_t *)malloc(sizeof(uint32_t)); + *teams_done_ptr00 = 0; + d_teams_done_ptr00 = (uint32_t *)omp_target_alloc(sizeof(uint32_t), devid); + omp_target_memcpy(d_teams_done_ptr00, teams_done_ptr00, sizeof(uint32_t), 0, + 0, devid, omp_get_initial_device()); + } + // Making the array_size 64 bits avoids a data_submit and data_retrieve + const uint64_t LOOP_TEAMS = team_procs00; + const uint64_t LOOP_SIZE = (uint64_t)array_size; + if (warp_size == 64) { +#pragma omp target teams distribute parallel for num_teams(LOOP_TEAMS) \ + num_threads(_XTEAM_NUM_THREADS) map(tofrom \ + : dot) \ + is_device_ptr(d_team_vals00, d_teams_done_ptr00) + for (unsigned int k = 0; k < (LOOP_TEAMS * _XTEAM_NUM_THREADS); k++) { + TC val00 = zero_c; + constexpr int LOOP_START = 0; + constexpr uint32_t LOOP_STRIDE = 1; + _INNER_LOOP { val00 += a[i] * b[i]; } + _SUM_OVERLOAD_64_FCT(val00, &dot, d_team_vals00, d_teams_done_ptr00, + zero_c); + } + } else { +#pragma omp target teams distribute parallel for num_teams(LOOP_TEAMS) \ + num_threads(_XTEAM_NUM_THREADS) map(tofrom \ + : dot) \ + 
is_device_ptr(d_team_vals00, d_teams_done_ptr00) + for (unsigned int k = 0; k < (LOOP_TEAMS * _XTEAM_NUM_THREADS); k++) { + TC val00 = zero_c; + constexpr int LOOP_START = 0; + constexpr uint32_t LOOP_STRIDE = 1; + _INNER_LOOP { val00 += a[i] * b[i]; } + _SUM_OVERLOAD_32_FCT(val00, &dot, d_team_vals00, d_teams_done_ptr00, + zero_c); + } + } + return dot; +} + +template +void run_tests_complex(const uint64_t array_size) { + + // FIXME: Is there a way to get warpsize of device from host? + // clang Codegen knows warpsize from GPU type. + int warp_size; +#pragma omp target map(tofrom : warp_size) + warp_size = __kmpc_get_warp_size(); + + TC *a = (TC *)aligned_alloc(2 * 1024 * 1024, sizeof(TC) * array_size); + TC *b = (TC *)aligned_alloc(2 * 1024 * 1024, sizeof(TC) * array_size); + +#pragma omp target enter data map(alloc : a [0:array_size], b [0:array_size]) + TC startA; + __real__(startA) = 1.0; + __imag__(startA) = 1.0; + TC startB; + __real__(startB) = 1.0; + __imag__(startB) = -1.0; + +#pragma omp target teams distribute parallel for + for (int64_t i = 0; i < array_size; i++) { + a[i] = startA; + b[i] = startB; + // a[i] * b[i] = 2 + 0i + } + + std::cout << "Running kernels " << repeat_num_times << " times" << std::endl; + std::cout << "Ignoring timing of first " << ignore_times << " runs " + << std::endl; + + double ETOL = 0.0000001; + if (sizeof(TC) == sizeof(float _Complex)) + std::cout << "Precision: float _Complex" << std::endl; + else + std::cout << "Precision: double _Complex" << std::endl; + + int team_procs = ompx_get_team_procs(omp_get_default_device()); + std::cout << "Warp size:" << warp_size << std::endl; + std::cout << "Team processors:" << team_procs << std::endl; + std::cout << "Array elements: " << array_size << std::endl; + std::cout << "Array size: " << ((array_size * sizeof(TC)) / (1024 * 1024)) + << " MB" << std::endl; + + T goldDotr = T(2) * (T)array_size; + T goldDoti = T(0); + + TC goldDot; + __real__(goldDot) = goldDotr; + 
__imag__(goldDot) = goldDoti; + + // List of times + std::vector> timings(2); + + // Declare timers + std::chrono::high_resolution_clock::time_point t1, t2; + + // timing loop + for (unsigned int k = 0; k < repeat_num_times; k++) { + t1 = std::chrono::high_resolution_clock::now(); + TC omp_sum = omp_dot_complex(a, b, array_size); + t2 = std::chrono::high_resolution_clock::now(); + timings[0].push_back( + std::chrono::duration_cast>(t2 - t1) + .count()); + _check_val_complex(omp_sum, goldDot, "omp_dot"); + + t1 = std::chrono::high_resolution_clock::now(); + TC sim_sum = sim_dot_complex(a, b, array_size, warp_size); + t2 = std::chrono::high_resolution_clock::now(); + timings[1].push_back( + std::chrono::duration_cast>(t2 - t1) + .count()); + _check_val_complex(sim_sum, goldDot, "sim_dot"); + + } // end timing loop + + // Display timing results + std::cout << std::left << std::setw(12) << "Function" << std::left + << std::setw(12) << "Best-MB/sec" << std::left << std::setw(12) + << " Min (sec)" << std::left << std::setw(12) << " Max" + << std::left << std::setw(12) << "Average" << std::left + << std::setw(12) << "Avg-MB/sec" << std::endl; + + std::cout << std::fixed; + + std::string labels[2] = {"ompdot", "simdot"}; + size_t sizes[2] = {2 * sizeof(TC) * array_size, 2 * sizeof(TC) * array_size}; + + for (int i = 0; i < 2; i++) { + // Get min/max; ignore the first couple results + auto minmax = std::minmax_element(timings[i].begin() + ignore_times, + timings[i].end()); + + // Calculate average; ignore ignore_times + double average = std::accumulate(timings[i].begin() + ignore_times, + timings[i].end(), 0.0) / + (double)(repeat_num_times - ignore_times); + + printf(" %s %8.0f %8.6f %8.6f %8.6f %8.0f\n", + labels[i].c_str(), 1.0E-6 * sizes[i] / (*minmax.first), + (double)*minmax.first, (double)*minmax.second, (double)average, + 1.0E-6 * sizes[i] / (average)); + } +#pragma omp target exit data map(release : a [0:array_size], b [0:array_size]) + free(a); + free(b); +} diff 
--git a/openmp/libomptarget/test/xteamr/xteamr_host_dev.h b/openmp/libomptarget/test/xteamr/xteamr_host_dev.h
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/test/xteamr/xteamr_host_dev.h
@@ -0,0 +1,638 @@
+//===---- xteamr_host_dev.h - Device declares and host defs for xteamr --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Cannot include Interface.h for this test, so this is a repeat of headers
+// for Xteamr and Rfuns from that file. This file also contains the host
+// dummy functions needed to link the test.
+//
+//===----------------------------------------------------------------------===//
+
+#if defined(__AMDGCN__) || defined(__NVPTX__)
+extern "C" {
+
+///
+/// Helper functions for high performance cross team reductions.
+///
+/// Name decoder: __kmpc_xteamr_<dtype>_<waves>x<warpsize>
+///   <waves>: number of warps/waves in team
+///   <warpsize>: 32 or 64
+/// example: __kmpc_xteamr_f_16x64 is the helper function for data
+/// type float with fixed teamsize of 1024 threads.
+/// There are 48 (6 x 8) helper functions.
+/// 6 configurations of teamsize are currently created.
+/// The most performant configs use max teamsize 1024: 16x64 and 32x32.
+/// Currently the other configs are 8x64, 4x64, 16x32, and 8x32.
+/// 8 data types available: double, float, int, uint, long, ulong,
+/// double _Complex, and float _Complex
+///
+/// All helper functions have these 7 args:
+/// arg1: the thread local reduction value
+/// arg2: pointer to where result is written
+/// arg3: global array of team values for this reduction instance
+/// arg4: atomic counter of completed teams for this reduction instance
+/// arg5: void function pointer to pair reduction function,
+/// (e.g. 
sum(&a,b),min(&a,b),max(&a,b) +/// arg6: equivalent (to arg5) void function pointer of pair reduction +/// function on LDS memory, (e.g. sum(&a,&b),min(&a,&b),max(&a,&b) +/// arg7: Initializing value for the reduction type +/// +#define _RF_LDS volatile __attribute__((address_space(3))) +void __kmpc_xteamr_d_16x64(double v, double *r_ptr, double *tvals, + uint32_t *td_ptr, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), + const double iv); +void __kmpc_xteamr_f_16x64(float v, float *r_ptr, float *tvals, + uint32_t *td_ptr, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), + const float iv); +void __kmpc_xteamr_cd_16x64(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(_RF_LDS double _Complex *, + _RF_LDS double _Complex *), + const double _Complex iv); +void __kmpc_xteamr_cf_16x64(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(_RF_LDS float _Complex *, + _RF_LDS float _Complex *), + const float _Complex iv); +void __kmpc_xteamr_i_16x64(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), + const int iv); +void __kmpc_xteamr_ui_16x64(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(_RF_LDS uint32_t *, + _RF_LDS uint32_t *), + const uint32_t iv); +void __kmpc_xteamr_l_16x64(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), + const long iv); +void __kmpc_xteamr_ul_16x64(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(_RF_LDS uint64_t *, + _RF_LDS uint64_t *), + const uint64_t iv); +void 
__kmpc_xteamr_d_8x64(double v, double *r_ptr, double *tvals, + uint32_t *td_ptr, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), + const double iv); +void __kmpc_xteamr_f_8x64(float v, float *r_ptr, float *tvals, uint32_t *td_ptr, + void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), + const float iv); +void __kmpc_xteamr_cd_8x64(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(_RF_LDS double _Complex *, + _RF_LDS double _Complex *), + const double _Complex iv); +void __kmpc_xteamr_cf_8x64(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(_RF_LDS float _Complex *, + _RF_LDS float _Complex *), + const float _Complex iv); +void __kmpc_xteamr_i_8x64(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), + const int iv); +void __kmpc_xteamr_ui_8x64(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(_RF_LDS uint32_t *, + _RF_LDS uint32_t *), + const uint32_t iv); +void __kmpc_xteamr_l_8x64(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), + const long iv); +void __kmpc_xteamr_ul_8x64(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(_RF_LDS uint64_t *, + _RF_LDS uint64_t *), + const uint64_t iv); +void __kmpc_xteamr_d_4x64(double v, double *r_ptr, double *tvals, + uint32_t *td_ptr, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), + const double iv); +void __kmpc_xteamr_f_4x64(float v, float *r_ptr, float *tvals, uint32_t *td_ptr, + void (*_rf)(float *, float), + void 
(*_rf_lds)(_RF_LDS float *, _RF_LDS float *), + const float iv); +void __kmpc_xteamr_cd_4x64(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(_RF_LDS double _Complex *, + _RF_LDS double _Complex *), + const double _Complex iv); +void __kmpc_xteamr_cf_4x64(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(_RF_LDS float _Complex *, + _RF_LDS float _Complex *), + const float _Complex iv); +void __kmpc_xteamr_i_4x64(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), + const int iv); +void __kmpc_xteamr_ui_4x64(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(_RF_LDS uint32_t *, + _RF_LDS uint32_t *), + const uint32_t iv); +void __kmpc_xteamr_l_4x64(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), + const long iv); +void __kmpc_xteamr_ul_4x64(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(_RF_LDS uint64_t *, + _RF_LDS uint64_t *), + const uint64_t iv); +void __kmpc_xteamr_d_32x32(double v, double *r_ptr, double *tvals, + uint32_t *td_ptr, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), + const double iv); +void __kmpc_xteamr_f_32x32(float v, float *r_ptr, float *tvals, + uint32_t *td_ptr, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), + const float iv); +void __kmpc_xteamr_cd_32x32(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(_RF_LDS double _Complex *, + _RF_LDS double _Complex *), + const 
double _Complex iv); +void __kmpc_xteamr_cf_32x32(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(_RF_LDS float _Complex *, + _RF_LDS float _Complex *), + const float _Complex iv); + +void __kmpc_xteamr_i_32x32(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), + const int iv); + +void __kmpc_xteamr_ui_32x32(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(_RF_LDS uint32_t *, + _RF_LDS uint32_t *), + const uint32_t iv); +void __kmpc_xteamr_l_32x32(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), + const long iv); +void __kmpc_xteamr_ul_32x32(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(_RF_LDS uint64_t *, + _RF_LDS uint64_t *), + const uint64_t iv); +void __kmpc_xteamr_d_16x32(double v, double *r_ptr, double *tvals, + uint32_t *td_ptr, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), + const double iv); +void __kmpc_xteamr_f_16x32(float v, float *r_ptr, float *tvals, + uint32_t *td_ptr, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), + const float iv); +void __kmpc_xteamr_cd_16x32(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(_RF_LDS double _Complex *, + _RF_LDS double _Complex *), + const double _Complex iv); +void __kmpc_xteamr_cf_16x32(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(_RF_LDS float _Complex *, + _RF_LDS float _Complex *), + const float _Complex iv); + +void 
__kmpc_xteamr_i_16x32(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), + const int iv); +void __kmpc_xteamr_ui_16x32(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(_RF_LDS uint32_t *, + _RF_LDS uint32_t *), + const uint32_t iv); +void __kmpc_xteamr_l_16x32(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), + const long iv); +void __kmpc_xteamr_ul_16x32(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(_RF_LDS uint64_t *, + _RF_LDS uint64_t *), + const uint64_t iv); +void __kmpc_xteamr_d_8x32(double v, double *r_ptr, double *tvals, + uint32_t *td_ptr, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), + const double iv); +void __kmpc_xteamr_f_8x32(float v, float *r_ptr, float *tvals, uint32_t *td_ptr, + void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), + const float iv); +void __kmpc_xteamr_cd_8x32(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(_RF_LDS double _Complex *, + _RF_LDS double _Complex *), + const double _Complex iv); +void __kmpc_xteamr_cf_8x32(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(_RF_LDS float _Complex *, + _RF_LDS float _Complex *), + const float _Complex iv); +void __kmpc_xteamr_i_8x32(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), + const int iv); +void __kmpc_xteamr_ui_8x32(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void 
(*_rf_lds)(_RF_LDS uint32_t *, + _RF_LDS uint32_t *), + const uint32_t iv); +void __kmpc_xteamr_l_8x32(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), + const long iv); +void __kmpc_xteamr_ul_8x32(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(_RF_LDS uint64_t *, + _RF_LDS uint64_t *), + const uint64_t iv); + +/// Builtin pair reduction functions. +/// These become function pointers for arg5 and arg6 of xteamr above. +/// Each pair reduction function must have two variants to support xteamr. +/// The 1st is for TLS memory and the 2nd is for LDS (scratchpad) memory. +/// These are defined in Rfuns.cpp. User defined reductions require +/// that Clang codegen generate these functions. +void __kmpc_rfun_sum_d(double *val, double otherval); +void __kmpc_rfun_sum_lds_d(_RF_LDS double *val, _RF_LDS double *otherval); +void __kmpc_rfun_sum_f(float *val, float otherval); +void __kmpc_rfun_sum_lds_f(_RF_LDS float *val, _RF_LDS float *otherval); +void __kmpc_rfun_sum_cd(double _Complex *val, double _Complex otherval); +void __kmpc_rfun_sum_lds_cd(_RF_LDS double _Complex *val, + _RF_LDS double _Complex *otherval); +void __kmpc_rfun_sum_cf(float _Complex *val, float _Complex otherval); +void __kmpc_rfun_sum_lds_cf(_RF_LDS float _Complex *val, + _RF_LDS float _Complex *otherval); +void __kmpc_rfun_sum_i(int *val, int otherval); +void __kmpc_rfun_sum_lds_i(_RF_LDS int *val, _RF_LDS int *otherval); +void __kmpc_rfun_sum_ui(unsigned int *val, unsigned int otherval); +void __kmpc_rfun_sum_lds_ui(_RF_LDS unsigned int *val, + _RF_LDS unsigned int *otherval); +void __kmpc_rfun_sum_l(long *val, long otherval); +void __kmpc_rfun_sum_lds_l(_RF_LDS long *val, _RF_LDS long *otherval); +void __kmpc_rfun_sum_ul(unsigned long *val, unsigned long otherval); +void __kmpc_rfun_sum_lds_ul(_RF_LDS unsigned long *val, + _RF_LDS unsigned long 
*otherval); +/// Complex variables have no compare, so no min or max for cf and cd. +void __kmpc_rfun_min_d(double *val, double otherval); +void __kmpc_rfun_min_lds_d(_RF_LDS double *val, _RF_LDS double *otherval); +void __kmpc_rfun_min_f(float *val, float otherval); +void __kmpc_rfun_min_lds_f(_RF_LDS float *val, _RF_LDS float *otherval); +void __kmpc_rfun_min_i(int *val, int otherval); +void __kmpc_rfun_min_lds_i(_RF_LDS int *val, _RF_LDS int *otherval); +void __kmpc_rfun_min_ui(unsigned int *val, unsigned int otherval); +void __kmpc_rfun_min_lds_ui(_RF_LDS unsigned int *val, + _RF_LDS unsigned int *otherval); +void __kmpc_rfun_min_l(long *val, long otherval); +void __kmpc_rfun_min_lds_l(_RF_LDS long *val, _RF_LDS long *otherval); +void __kmpc_rfun_min_ul(unsigned long *val, unsigned long otherval); +void __kmpc_rfun_min_lds_ul(_RF_LDS unsigned long *val, + _RF_LDS unsigned long *otherval); +void __kmpc_rfun_max_d(double *val, double otherval); +void __kmpc_rfun_max_lds_d(_RF_LDS double *val, _RF_LDS double *otherval); +void __kmpc_rfun_max_f(float *val, float otherval); +void __kmpc_rfun_max_lds_f(_RF_LDS float *val, _RF_LDS float *otherval); +void __kmpc_rfun_max_i(int *val, int otherval); +void __kmpc_rfun_max_lds_i(_RF_LDS int *val, _RF_LDS int *otherval); +void __kmpc_rfun_max_ui(unsigned int *val, unsigned int otherval); +void __kmpc_rfun_max_lds_ui(_RF_LDS unsigned int *val, + _RF_LDS unsigned int *otherval); +void __kmpc_rfun_max_l(long *val, long otherval); +void __kmpc_rfun_max_lds_l(_RF_LDS long *val, _RF_LDS long *otherval); +void __kmpc_rfun_max_ul(unsigned long *val, unsigned long otherval); +void __kmpc_rfun_max_lds_ul(_RF_LDS unsigned long *val, + _RF_LDS unsigned long *otherval); +#undef _RF_LDS + +int __kmpc_get_warp_size(); + +} // end of extern "C" + +#else // end of device compilation + +// For host compilation, define null functions for host linking + +extern "C" { +#define _RF_LDS +void __kmpc_xteamr_d_16x64(double v, double *r_ptr, double 
*tvals, + uint32_t *td_ptr, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), + const double iv) {} +void __kmpc_xteamr_f_16x64(float v, float *r_ptr, float *tvals, + uint32_t *td_ptr, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), + const float iv) {} +void __kmpc_xteamr_cd_16x64(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(_RF_LDS double _Complex *, + _RF_LDS double _Complex *), + const double _Complex iv) {} +void __kmpc_xteamr_cf_16x64(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(_RF_LDS float _Complex *, + _RF_LDS float _Complex *), + const float _Complex iv) {} +void __kmpc_xteamr_i_16x64(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), + const int iv) {} +void __kmpc_xteamr_ui_16x64(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(_RF_LDS uint32_t *, + _RF_LDS uint32_t *), + const uint32_t iv) {} +void __kmpc_xteamr_l_16x64(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), + const long iv) {} +void __kmpc_xteamr_ul_16x64(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(_RF_LDS uint64_t *, + _RF_LDS uint64_t *), + const uint64_t iv) {} +void __kmpc_xteamr_d_8x64(double v, double *r_ptr, double *tvals, + uint32_t *td_ptr, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), + const double iv) {} +void __kmpc_xteamr_f_8x64(float v, float *r_ptr, float *tvals, uint32_t *td_ptr, + void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS 
float *), + const float iv) {} +void __kmpc_xteamr_cd_8x64(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(_RF_LDS double _Complex *, + _RF_LDS double _Complex *), + const double _Complex iv) {} +void __kmpc_xteamr_cf_8x64(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(_RF_LDS float _Complex *, + _RF_LDS float _Complex *), + const float _Complex iv) {} +void __kmpc_xteamr_i_8x64(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), + const int iv) {} +void __kmpc_xteamr_ui_8x64(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(_RF_LDS uint32_t *, + _RF_LDS uint32_t *), + const uint32_t iv) {} +void __kmpc_xteamr_l_8x64(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), + const long iv) {} +void __kmpc_xteamr_ul_8x64(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(_RF_LDS uint64_t *, + _RF_LDS uint64_t *), + const uint64_t iv) {} +void __kmpc_xteamr_d_4x64(double v, double *r_ptr, double *tvals, + uint32_t *td_ptr, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), + const double iv) {} +void __kmpc_xteamr_f_4x64(float v, float *r_ptr, float *tvals, uint32_t *td_ptr, + void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), + const float iv) {} +void __kmpc_xteamr_cd_4x64(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(_RF_LDS double _Complex *, + _RF_LDS double _Complex *), + const double _Complex iv) {} 
+void __kmpc_xteamr_cf_4x64(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(_RF_LDS float _Complex *, + _RF_LDS float _Complex *), + const float _Complex iv) {} +void __kmpc_xteamr_i_4x64(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), + const int iv) {} +void __kmpc_xteamr_ui_4x64(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(_RF_LDS uint32_t *, + _RF_LDS uint32_t *), + const uint32_t iv) {} +void __kmpc_xteamr_l_4x64(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), + const long iv) {} +void __kmpc_xteamr_ul_4x64(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(_RF_LDS uint64_t *, + _RF_LDS uint64_t *), + const uint64_t iv) {} +void __kmpc_xteamr_d_32x32(double v, double *r_ptr, double *tvals, + uint32_t *td_ptr, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), + const double iv) {} +void __kmpc_xteamr_f_32x32(float v, float *r_ptr, float *tvals, + uint32_t *td_ptr, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), + const float iv) {} +void __kmpc_xteamr_cd_32x32(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(_RF_LDS double _Complex *, + _RF_LDS double _Complex *), + const double _Complex iv) {} +void __kmpc_xteamr_cf_32x32(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(_RF_LDS float _Complex *, + _RF_LDS float _Complex *), + const float _Complex iv) {} +void __kmpc_xteamr_i_32x32(int v, 
int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), + const int iv) {} +void __kmpc_xteamr_ui_32x32(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(_RF_LDS uint32_t *, + _RF_LDS uint32_t *), + const uint32_t iv) {} +void __kmpc_xteamr_l_32x32(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), + const long iv) {} +void __kmpc_xteamr_ul_32x32(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(_RF_LDS uint64_t *, + _RF_LDS uint64_t *), + const uint64_t iv) {} +void __kmpc_xteamr_d_16x32(double v, double *r_ptr, double *tvals, + uint32_t *td_ptr, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), + const double iv) {} +void __kmpc_xteamr_f_16x32(float v, float *r_ptr, float *tvals, + uint32_t *td_ptr, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), + const float iv) {} +void __kmpc_xteamr_cd_16x32(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(_RF_LDS double _Complex *, + _RF_LDS double _Complex *), + const double _Complex iv) {} +void __kmpc_xteamr_cf_16x32(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(_RF_LDS float _Complex *, + _RF_LDS float _Complex *), + const float _Complex iv) {} +void __kmpc_xteamr_i_16x32(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), + const int iv) {} +void __kmpc_xteamr_ui_16x32(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(_RF_LDS 
uint32_t *, + _RF_LDS uint32_t *), + const uint32_t iv) {} +void __kmpc_xteamr_l_16x32(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), + const long iv) {} +void __kmpc_xteamr_ul_16x32(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(_RF_LDS uint64_t *, + _RF_LDS uint64_t *), + const uint64_t iv) {} +void __kmpc_xteamr_d_8x32(double v, double *r_ptr, double *tvals, + uint32_t *td_ptr, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), + const double iv) {} +void __kmpc_xteamr_f_8x32(float v, float *r_ptr, float *tvals, uint32_t *td_ptr, + void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), + const float iv) {} +void __kmpc_xteamr_cd_8x32(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(_RF_LDS double _Complex *, + _RF_LDS double _Complex *), + const double _Complex iv) {} +void __kmpc_xteamr_cf_8x32(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(_RF_LDS float _Complex *, + _RF_LDS float _Complex *), + const float _Complex iv) {} +void __kmpc_xteamr_i_8x32(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), + const int iv) {} +void __kmpc_xteamr_ui_8x32(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(_RF_LDS uint32_t *, + _RF_LDS uint32_t *), + const uint32_t iv) {} +void __kmpc_xteamr_l_8x32(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), + const long iv) {} +void __kmpc_xteamr_ul_8x32(uint64_t v, uint64_t *r_ptr, 
uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(_RF_LDS uint64_t *, + _RF_LDS uint64_t *), + const uint64_t iv) {} + +/// Builtin pair reduction functions. +/// These become function pointers for arg5 and arg6 of xteamr above. +/// Each pair reduction function must have two variants to support xteamr. +/// The 1st is for TLS memory and the 2nd is for LDS (scratchpad) memory. +/// Device versions live in Rfuns.cpp; the host stubs below are empty. User +/// defined reductions require that Clang codegen generate these functions. +void __kmpc_rfun_sum_d(double *val, double otherval) {} +void __kmpc_rfun_sum_lds_d(_RF_LDS double *val, _RF_LDS double *otherval) {} +void __kmpc_rfun_sum_f(float *val, float otherval) {} +void __kmpc_rfun_sum_lds_f(_RF_LDS float *val, _RF_LDS float *otherval) {} +void __kmpc_rfun_sum_cd(double _Complex *val, double _Complex otherval) {} +void __kmpc_rfun_sum_lds_cd(_RF_LDS double _Complex *val, + _RF_LDS double _Complex *otherval) {} +void __kmpc_rfun_sum_cf(float _Complex *val, float _Complex otherval) {} +void __kmpc_rfun_sum_lds_cf(_RF_LDS float _Complex *val, + _RF_LDS float _Complex *otherval) {} +void __kmpc_rfun_sum_i(int *val, int otherval) {} +void __kmpc_rfun_sum_lds_i(_RF_LDS int *val, _RF_LDS int *otherval) {} +void __kmpc_rfun_sum_ui(unsigned int *val, unsigned int otherval) {} +void __kmpc_rfun_sum_lds_ui(_RF_LDS unsigned int *val, + _RF_LDS unsigned int *otherval) {} +void __kmpc_rfun_sum_l(long *val, long otherval) {} +void __kmpc_rfun_sum_lds_l(_RF_LDS long *val, _RF_LDS long *otherval) {} +void __kmpc_rfun_sum_ul(unsigned long *val, unsigned long otherval) {} +void __kmpc_rfun_sum_lds_ul(_RF_LDS unsigned long *val, + _RF_LDS unsigned long *otherval) {} +/// Complex variables have no compare, so no min or max for cf and cd. 
+void __kmpc_rfun_min_d(double *val, double otherval) {} +void __kmpc_rfun_min_lds_d(_RF_LDS double *val, _RF_LDS double *otherval) {} +void __kmpc_rfun_min_f(float *val, float otherval) {} +void __kmpc_rfun_min_lds_f(_RF_LDS float *val, _RF_LDS float *otherval) {} +void __kmpc_rfun_min_i(int *val, int otherval) {} +void __kmpc_rfun_min_lds_i(_RF_LDS int *val, _RF_LDS int *otherval) {} +void __kmpc_rfun_min_ui(unsigned int *val, unsigned int otherval) {} +void __kmpc_rfun_min_lds_ui(_RF_LDS unsigned int *val, + _RF_LDS unsigned int *otherval) {} +void __kmpc_rfun_min_l(long *val, long otherval) {} +void __kmpc_rfun_min_lds_l(_RF_LDS long *val, _RF_LDS long *otherval) {} +void __kmpc_rfun_min_ul(unsigned long *val, unsigned long otherval) {} +void __kmpc_rfun_min_lds_ul(_RF_LDS unsigned long *val, + _RF_LDS unsigned long *otherval) {} +void __kmpc_rfun_max_d(double *val, double otherval) {} +void __kmpc_rfun_max_lds_d(_RF_LDS double *val, _RF_LDS double *otherval) {} +void __kmpc_rfun_max_f(float *val, float otherval) {} +void __kmpc_rfun_max_lds_f(_RF_LDS float *val, _RF_LDS float *otherval) {} +void __kmpc_rfun_max_i(int *val, int otherval) {} +void __kmpc_rfun_max_lds_i(_RF_LDS int *val, _RF_LDS int *otherval) {} +void __kmpc_rfun_max_ui(unsigned int *val, unsigned int otherval) {} +void __kmpc_rfun_max_lds_ui(_RF_LDS unsigned int *val, + _RF_LDS unsigned int *otherval) {} +void __kmpc_rfun_max_l(long *val, long otherval) {} +void __kmpc_rfun_max_lds_l(_RF_LDS long *val, _RF_LDS long *otherval) {} +void __kmpc_rfun_max_ul(unsigned long *val, unsigned long otherval) {} +void __kmpc_rfun_max_lds_ul(_RF_LDS unsigned long *val, + _RF_LDS unsigned long *otherval) {} +#undef _RF_LDS + +// host version of __kmpc_get_warp_size +int __kmpc_get_warp_size() { + printf(" executing on host \n"); + return -1; +} + +} // end of extern "C" + +#endif // end of host definitions