diff --git a/openmp/libomptarget/DeviceRTL/CMakeLists.txt b/openmp/libomptarget/DeviceRTL/CMakeLists.txt
--- a/openmp/libomptarget/DeviceRTL/CMakeLists.txt
+++ b/openmp/libomptarget/DeviceRTL/CMakeLists.txt
@@ -95,6 +95,7 @@
   ${include_directory}/Synchronization.h
   ${include_directory}/Types.h
   ${include_directory}/Utils.h
+  ${include_directory}/Xteamr.h
 )
 
 set(src_files
@@ -111,6 +112,7 @@
   ${source_directory}/Tasking.cpp
   ${source_directory}/Utils.cpp
   ${source_directory}/Workshare.cpp
+  ${source_directory}/Xteamr.cpp
 )
 
 set(clang_opt_flags -O3 -mllvm -openmp-opt-disable -DSHARED_SCRATCHPAD_SIZE=512)
diff --git a/openmp/libomptarget/DeviceRTL/include/Interface.h b/openmp/libomptarget/DeviceRTL/include/Interface.h
--- a/openmp/libomptarget/DeviceRTL/include/Interface.h
+++ b/openmp/libomptarget/DeviceRTL/include/Interface.h
@@ -13,6 +13,7 @@
 #define OMPTARGET_DEVICERTL_INTERFACE_H
 
 #include "Types.h"
+#include "Xteamr.h"
 
 /// External API
 ///
diff --git a/openmp/libomptarget/DeviceRTL/include/Xteamr.h b/openmp/libomptarget/DeviceRTL/include/Xteamr.h
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/DeviceRTL/include/Xteamr.h
@@ -0,0 +1,250 @@
+//===---------------- Xteamr.h - OpenMP interface ----------------- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// Modifications Copyright (c) 2022 Advanced Micro Devices, All rights reserved.
+// Notified per clause 4(b) of the license.
+//
+//===----------------------------------------------------------------------===//
+//
+// DeviceRTL Header file: Xteamr.h
+// External __kmpc headers for cross team reduction functions defined
+// in DeviceRTL/src/Xteamr.cpp. Clang generates a call to one of these
+// functions when it encounters a reduction. The specific function depends
+// on the data type, the warp size, and the number of waves in the team.
+// The number of waves must be a power of 2 and the total number of threads
+// must be greater than or equal to the number of teams.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OMPTARGET_DEVICERTL_XTEAMR_H
+#define OMPTARGET_DEVICERTL_XTEAMR_H
+#include "Types.h"
+
+#define _CD double _Complex
+#define _CF float _Complex
+#define _UI unsigned int
+#define _UL unsigned long
+#define _INLINE_ATTR_ __attribute__((flatten, always_inline))
+#define _RF_LDS volatile __attribute__((address_space(3)))
+
+/// __kmpc_xteamr_<dt>_<nw>x<wsz>: cross team reduction functions
+///   <dt>:  letter(s) representing the data type
+///   <nw>:  number of warps/waves in the thread block
+///   <wsz>: warp size, so <nw> x <wsz> is the team size in threads
+///          example: 16x64 is a team size of 1024 threads
+/// arg1: the thread local reduction value.
+/// arg2: pointer to where result is to be written.
+/// arg3: global array of team values for this reduction instance.
+/// arg4: pointer to atomic counter of completed teams +/// arg5: function pointer to reduction type function (sum,min,max) +/// arg6: function pointer to reduction type function on LDS memory +/// arg7: Reduction null value +extern "C" { +void _INLINE_ATTR_ __kmpc_xteamr_d_16x64( + double v, double *r_ptr, double *tvs, uint32_t *td, + void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double rnv); +void _INLINE_ATTR_ __kmpc_xteamr_f_16x64( + float v, float *r_ptr, float *tvs, uint32_t *td, + void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float rnv); +void _INLINE_ATTR_ __kmpc_xteamr_cd_16x64( + _CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), + void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD rnv); +void _INLINE_ATTR_ __kmpc_xteamr_cf_16x64( + _CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), + void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF rnv); +void _INLINE_ATTR_ __kmpc_xteamr_i_16x64( + int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int rnv); +void _INLINE_ATTR_ __kmpc_xteamr_ui_16x64( + _UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), + void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI rnv); +void _INLINE_ATTR_ __kmpc_xteamr_l_16x64( + long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long rnv); +void _INLINE_ATTR_ __kmpc_xteamr_ul_16x64( + _UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), + void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL rnv); +void _INLINE_ATTR_ __kmpc_xteamr_d_32x32( + double v, double *r_ptr, double *tvs, uint32_t *td, + void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double rnv); +void _INLINE_ATTR_ __kmpc_xteamr_f_32x32( + float v, float *r_ptr, float *tvs, uint32_t *td, + void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float rnv); +void _INLINE_ATTR_ __kmpc_xteamr_cd_32x32( + _CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), + void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD rnv); +void _INLINE_ATTR_ __kmpc_xteamr_cf_32x32( + _CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), + void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF rnv); +void _INLINE_ATTR_ __kmpc_xteamr_i_32x32( + int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int rnv); +void _INLINE_ATTR_ __kmpc_xteamr_ui_32x32( + _UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), + void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI rnv); +void _INLINE_ATTR_ __kmpc_xteamr_l_32x32( + long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long rnv); +void _INLINE_ATTR_ __kmpc_xteamr_ul_32x32( + _UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), + void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL rnv); +void _INLINE_ATTR_ __kmpc_xteamr_d_8x64( + double v, double *r_ptr, double *tvs, uint32_t *td, + void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double rnv); +void _INLINE_ATTR_ __kmpc_xteamr_f_8x64( + float v, float *r_ptr, float *tvs, uint32_t *td, + void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS 
float *, _RF_LDS float *), const float rnv); +void _INLINE_ATTR_ __kmpc_xteamr_cd_8x64( + _CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), + void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD rnv); +void _INLINE_ATTR_ __kmpc_xteamr_cf_8x64( + _CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), + void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF rnv); +void _INLINE_ATTR_ __kmpc_xteamr_i_8x64( + int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int rnv); +void _INLINE_ATTR_ __kmpc_xteamr_ui_8x64( + _UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), + void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI rnv); +void _INLINE_ATTR_ __kmpc_xteamr_l_8x64( + long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long rnv); +void _INLINE_ATTR_ __kmpc_xteamr_ul_8x64( + _UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), + void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL rnv); +void _INLINE_ATTR_ __kmpc_xteamr_d_16x32( + double v, double *r_ptr, double *tvs, uint32_t *td, + void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double rnv); +void _INLINE_ATTR_ __kmpc_xteamr_f_16x32( + float v, float *r_ptr, float *tvs, uint32_t *td, + void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float rnv); +void _INLINE_ATTR_ __kmpc_xteamr_cd_16x32( + _CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), + void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD rnv); +void _INLINE_ATTR_ __kmpc_xteamr_cf_16x32( + _CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), + void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF rnv); +void _INLINE_ATTR_ __kmpc_xteamr_i_16x32( + int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int rnv); +void _INLINE_ATTR_ __kmpc_xteamr_ui_16x32( + _UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), + void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI rnv); +void _INLINE_ATTR_ __kmpc_xteamr_l_16x32( + long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long rnv); +void _INLINE_ATTR_ __kmpc_xteamr_ul_16x32( + _UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), + void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL rnv); +void _INLINE_ATTR_ __kmpc_xteamr_d_4x64( + double v, double *r_ptr, double *tvs, uint32_t *td, + void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double rnv); +void _INLINE_ATTR_ __kmpc_xteamr_f_4x64( + float v, float *r_ptr, float *tvs, uint32_t *td, + void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float rnv); +void _INLINE_ATTR_ __kmpc_xteamr_cd_4x64( + _CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), + void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD rnv); +void _INLINE_ATTR_ __kmpc_xteamr_cf_4x64( + _CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), + void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF rnv); +void _INLINE_ATTR_ __kmpc_xteamr_i_4x64( + int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int rnv); +void _INLINE_ATTR_ 
__kmpc_xteamr_ui_4x64( + _UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), + void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI rnv); +void _INLINE_ATTR_ __kmpc_xteamr_l_4x64( + long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long rnv); +void _INLINE_ATTR_ __kmpc_xteamr_ul_4x64( + _UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), + void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL rnv); +void _INLINE_ATTR_ __kmpc_xteamr_d_8x32( + double v, double *r_ptr, double *tvs, uint32_t *td, + void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double rnv); +void _INLINE_ATTR_ __kmpc_xteamr_f_8x32( + float v, float *r_ptr, float *tvs, uint32_t *td, + void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float rnv); +void _INLINE_ATTR_ __kmpc_xteamr_cd_8x32( + _CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), + void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD rnv); +void _INLINE_ATTR_ __kmpc_xteamr_cf_8x32( + _CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), + void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF rnv); +void _INLINE_ATTR_ __kmpc_xteamr_i_8x32( + int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int rnv); +void _INLINE_ATTR_ __kmpc_xteamr_ui_8x32( + _UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), + void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI rnv); +void _INLINE_ATTR_ __kmpc_xteamr_l_8x32( + long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long rnv); +void _INLINE_ATTR_ __kmpc_xteamr_ul_8x32( + _UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), + void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL rnv); +void __kmpc_rfun_sum_d(double *val, double otherval); +void __kmpc_rfun_sum_lds_d(_RF_LDS double *val, _RF_LDS double *otherval); +void __kmpc_rfun_sum_f(float *val, float otherval); +void __kmpc_rfun_sum_lds_f(_RF_LDS float *val, _RF_LDS float *otherval); +void __kmpc_rfun_sum_cd(_CD *val, _CD otherval); +void __kmpc_rfun_sum_lds_cd(_RF_LDS _CD *val, _RF_LDS _CD *otherval); +void __kmpc_rfun_sum_cf(_CF *val, _CF otherval); +void __kmpc_rfun_sum_lds_cf(_RF_LDS _CF *val, _RF_LDS _CF *otherval); +void __kmpc_rfun_sum_i(int *val, int otherval); +void __kmpc_rfun_sum_lds_i(_RF_LDS int *val, _RF_LDS int *otherval); +void __kmpc_rfun_sum_ui(_UI *val, _UI otherval); +void __kmpc_rfun_sum_lds_ui(_RF_LDS _UI *val, _RF_LDS _UI *otherval); +void __kmpc_rfun_sum_l(long *val, long otherval); +void __kmpc_rfun_sum_lds_l(_RF_LDS long *val, _RF_LDS long *otherval); +void __kmpc_rfun_sum_ul(_UL *val, _UL otherval); +void __kmpc_rfun_sum_lds_ul(_RF_LDS _UL *val, _RF_LDS _UL *otherval); +void __kmpc_rfun_max_d(double *val, double otherval); +void __kmpc_rfun_max_lds_d(_RF_LDS double *val, _RF_LDS double *otherval); +void __kmpc_rfun_max_f(float *val, float otherval); +void __kmpc_rfun_max_lds_f(_RF_LDS float *val, _RF_LDS float *otherval); +void __kmpc_rfun_max_i(int *val, int otherval); +void __kmpc_rfun_max_lds_i(_RF_LDS int *val, _RF_LDS int *otherval); +void __kmpc_rfun_max_ui(_UI *val, _UI otherval); +void __kmpc_rfun_max_lds_ui(_RF_LDS _UI *val, _RF_LDS _UI *otherval); +void __kmpc_rfun_max_l(long *val, long otherval); +void 
__kmpc_rfun_max_lds_l(_RF_LDS long *val, _RF_LDS long *otherval); +void __kmpc_rfun_max_ul(_UL *val, _UL otherval); +void __kmpc_rfun_max_lds_ul(_RF_LDS _UL *val, _RF_LDS _UL *otherval); +void __kmpc_rfun_min_d(double *val, double otherval); +void __kmpc_rfun_min_lds_d(_RF_LDS double *val, _RF_LDS double *otherval); +void __kmpc_rfun_min_f(float *val, float otherval); +void __kmpc_rfun_min_lds_f(_RF_LDS float *val, _RF_LDS float *otherval); +void __kmpc_rfun_min_i(int *val, int otherval); +void __kmpc_rfun_min_lds_i(_RF_LDS int *val, _RF_LDS int *otherval); +void __kmpc_rfun_min_ui(_UI *val, _UI otherval); +void __kmpc_rfun_min_lds_ui(_RF_LDS _UI *val, _RF_LDS _UI *otherval); +void __kmpc_rfun_min_l(long *val, long otherval); +void __kmpc_rfun_min_lds_l(_RF_LDS long *val, _RF_LDS long *otherval); +void __kmpc_rfun_min_ul(_UL *val, _UL otherval); +void __kmpc_rfun_min_lds_ul(_RF_LDS _UL *val, _RF_LDS _UL *otherval); +} // end extern C + +#undef _CD +#undef _CF +#undef _UI +#undef _UL +#undef _INLINE_ATTR_ +#undef _RF_LDS + +#endif // of ifndef OMPTARGET_DEVICERTL_XTEAMR_H diff --git a/openmp/libomptarget/DeviceRTL/src/Xteamr.cpp b/openmp/libomptarget/DeviceRTL/src/Xteamr.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Xteamr.cpp @@ -0,0 +1,851 @@ +//===---- Xteamr.cpp - OpenMP cross team helper functions ---- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains helper functions for cross team reductions +// +//===----------------------------------------------------------------------===// + +#include "Xteamr.h" +#include "Debug.h" +#include "Interface.h" +#include "Mapping.h" +#include "State.h" +#include "Synchronization.h" +#include "Types.h" +#include "Utils.h" + +#define __XTEAM_SHARED_LDS volatile __attribute__((address_space(3))) + +using namespace _OMP; + +#pragma omp begin declare target device_type(nohost) + +// Headers for specialized shfl_xor +double xteamr_shfl_xor_d(double var, const int lane_mask, const uint32_t width); +float xteamr_shfl_xor_f(float var, const int lane_mask, const uint32_t width); +int xteamr_shfl_xor_int(int var, const int lane_mask, const uint32_t width); +double _Complex xteamr_shfl_xor_cd(double _Complex var, const int lane_mask, + const uint32_t width); +float _Complex xteamr_shfl_xor_cf(float _Complex var, const int lane_mask, + const uint32_t width); + +// Define the arch (amdgcn vs nvptx) variants of shfl + +#pragma omp begin declare variant match(device = {arch(amdgcn)}) + +int xteamr_shfl_xor_int(int var, const int lane_mask, const uint32_t width) { + int self = mapping::getThreadIdInWarp(); // __lane_id(); + int index = self ^ lane_mask; + index = index >= ((self + width) & ~(width - 1)) ? 
self : index;
+  return __builtin_amdgcn_ds_bpermute(index << 2, var);
+}
+float xteamr_shfl_xor_f(float var, const int lane_mask, const uint32_t width) {
+  union {
+    int i;
+    unsigned u;
+    float f;
+  } tmp;
+  tmp.f = var;
+  tmp.i = xteamr_shfl_xor_int(tmp.i, lane_mask, width);
+  return tmp.f;
+}
+double xteamr_shfl_xor_d(double var, const int lane_mask,
+                         const uint32_t width) {
+  static_assert(sizeof(double) == 2 * sizeof(int), "");
+  static_assert(sizeof(double) == sizeof(uint64_t), "");
+
+  int tmp[2];
+  __builtin_memcpy(tmp, &var, sizeof(tmp));
+  tmp[0] = xteamr_shfl_xor_int(tmp[0], lane_mask, width);
+  tmp[1] = xteamr_shfl_xor_int(tmp[1], lane_mask, width);
+
+  uint64_t tmp0 =
+      (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+  double tmp1;
+  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+  return tmp1;
+}
+
+double _Complex xteamr_shfl_xor_cd(double _Complex var, const int lane_mask,
+                                   const uint32_t width) {
+  __real__(var) = xteamr_shfl_xor_d(__real__(var), lane_mask, width);
+  __imag__(var) = xteamr_shfl_xor_d(__imag__(var), lane_mask, width);
+  return var;
+}
+float _Complex xteamr_shfl_xor_cf(float _Complex var, const int lane_mask,
+                                  const uint32_t width) {
+  __real__(var) = xteamr_shfl_xor_f(__real__(var), lane_mask, width);
+  __imag__(var) = xteamr_shfl_xor_f(__imag__(var), lane_mask, width);
+  return var;
+}
+#pragma omp end declare variant
+
+#pragma omp begin declare variant match(                                       \
+    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
+
+int xteamr_shfl_xor_int(int var, const int lane_mask, const uint32_t width) {
+  int c = ((32 - width) << 8) | 0x1f;
+  return __nvvm_shfl_sync_bfly_i32(0xFFFFFFFF, var, lane_mask, c);
+}
+float xteamr_shfl_xor_f(float var, const int lane_mask, const uint32_t width) {
+  union {
+    int i;
+    unsigned u;
+    float f;
+  } tmp;
+  tmp.f = var;
+  tmp.i = xteamr_shfl_xor_int(tmp.i, lane_mask, width);
+  return tmp.f;
+}
+double xteamr_shfl_xor_d(double var, int laneMask, const uint32_t width) {
+  unsigned lo, hi;
+  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
+  hi = xteamr_shfl_xor_int(hi, laneMask, width);
+  lo = xteamr_shfl_xor_int(lo, laneMask, width);
+  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi));
+  return var;
+}
+double _Complex xteamr_shfl_xor_cd(double _Complex var, const int lane_mask,
+                                   const uint32_t width) {
+  __real__(var) = xteamr_shfl_xor_d(__real__(var), lane_mask, width);
+  __imag__(var) = xteamr_shfl_xor_d(__imag__(var), lane_mask, width);
+  return var;
+}
+float _Complex xteamr_shfl_xor_cf(float _Complex var, const int lane_mask,
+                                  const uint32_t width) {
+  __real__(var) = xteamr_shfl_xor_f(__real__(var), lane_mask, width);
+  __imag__(var) = xteamr_shfl_xor_f(__imag__(var), lane_mask, width);
+  return var;
+}
+#pragma omp end declare variant
+// } // end impl namespace
+
+// tag dispatching of type specific shfl_xor, get_low, and get_high
+struct _d_tag {};
+struct _f_tag {};
+struct _cd_tag {};
+struct _cf_tag {};
+struct _i_tag {};
+struct _ui_tag {};
+struct _l_tag {};
+struct _ul_tag {};
+template <typename T> struct __dispatch_tag;
+template <> struct __dispatch_tag<double> {
+  typedef _d_tag type;
+};
+template <> struct __dispatch_tag<float> {
+  typedef _f_tag type;
+};
+template <> struct __dispatch_tag<double _Complex> {
+  typedef _cd_tag type;
+};
+template <> struct __dispatch_tag<float _Complex> {
+  typedef _cf_tag type;
+};
+template <> struct __dispatch_tag<int> {
+  typedef _i_tag type;
+};
+template <> struct __dispatch_tag<unsigned int> {
+  typedef _ui_tag type;
+};
+template <> struct __dispatch_tag<long> {
+  typedef _l_tag type;
+};
+template <> struct __dispatch_tag<unsigned long> {
+  typedef _ul_tag type;
+};
+
+template <const int _WSZ>
+double xteamr_shfl_xor(_d_tag tag, double var, const int lane_mask) {
+  return xteamr_shfl_xor_d(var, lane_mask, _WSZ);
+}
+template <const int _WSZ>
+float xteamr_shfl_xor(_f_tag tag, float var, const int lane_mask) {
+  return xteamr_shfl_xor_f(var, lane_mask, _WSZ);
+}
+template <const int _WSZ>
+double _Complex xteamr_shfl_xor(_cd_tag tag, double _Complex var,
+                                const int lane_mask) {
+  return xteamr_shfl_xor_cd(var, lane_mask, _WSZ);
+}
+template <const int _WSZ>
+float _Complex xteamr_shfl_xor(_cf_tag tag, float _Complex var,
+                               const int lane_mask) {
+  return xteamr_shfl_xor_cf(var, lane_mask, _WSZ);
+}
+template <const int _WSZ>
+int xteamr_shfl_xor(_i_tag tag, int var, const int lane_mask) {
+  return xteamr_shfl_xor_int(var, lane_mask, _WSZ);
+}
+template <const int _WSZ>
+unsigned int xteamr_shfl_xor(_ui_tag tag, unsigned int var,
+                             const int lane_mask) {
+  return xteamr_shfl_xor_int(var, lane_mask, _WSZ);
+}
+template <const int _WSZ>
+long xteamr_shfl_xor(_l_tag tag, long var, const int lane_mask) {
+  return xteamr_shfl_xor_d(var, lane_mask, _WSZ);
+}
+template <const int _WSZ>
+unsigned long xteamr_shfl_xor(_ul_tag tag, unsigned long var,
+                              const int lane_mask) {
+  return xteamr_shfl_xor_d(var, lane_mask, _WSZ);
+}
+
+template <typename T, const int _WSZ>
+T xteamr_shfl_xor(T var, const int lane_mask) {
+  typedef typename __dispatch_tag<T>::type tag;
+  return xteamr_shfl_xor<_WSZ>(tag(), var, lane_mask);
+}
+
+template <typename T, const int _NW, const int _WSZ>
+__attribute__((flatten, always_inline)) void _xteam_reduction(
+    T val, T *r_ptr, T *team_vals, uint32_t *teams_done_ptr,
+    void (*_rf)(T *, T),
+    void (*_rf_lds)(__XTEAM_SHARED_LDS T *, __XTEAM_SHARED_LDS T *),
+    const T inival) {
+
+  const uint32_t omp_thread_num = mapping::getThreadIdInBlock();
+  const uint32_t omp_team_num = mapping::getBlockId();
+  const uint32_t wave_num = mapping::getWarpId();         // 0 to 15
+  const uint32_t lane_num = mapping::getThreadIdInWarp(); // 0 to 63
+  const uint32_t NumTeams = mapping::getNumberOfBlocks();
+  static __XTEAM_SHARED_LDS T xwave_lds[_NW + 1];
+  static __XTEAM_SHARED_LDS bool __is_last_team;
+
+// Cuda may restrict max threads, so clear unused wave values
+#ifdef __NVPTX__
+  if (_NW == 32) {
+    if (omp_thread_num == 0) {
+      for (uint32_t i = (omp_get_num_threads() / 32); i < _NW; i++)
+        xwave_lds[i] = inival;
+    }
+  }
+#endif
+
+  // Binary reduce each wave, then copy to xwave_lds[wave_num]
+  for (unsigned int offset = _WSZ / 2; offset > 0; offset >>= 1)
+    (*_rf)(&val, xteamr_shfl_xor<T, _WSZ>(val, offset));
+  if (lane_num == 0)
+    xwave_lds[wave_num] = val;
+
+  // Binary reduce all wave values into xwave_lds[0]
+  _OMP::synchronize::threadsAligned();
+  for (unsigned int offset = _NW / 2; offset > 0; offset >>= 1) {
+    if (omp_thread_num < offset)
+      (*_rf_lds)(&(xwave_lds[omp_thread_num]),
+                 &(xwave_lds[omp_thread_num + offset]));
+  }
+  _OMP::synchronize::threadsAligned();
+
+  // Discover the last team to complete the cross wave reduction.
+  // The team number of the last team is nondeterministic.
+  __is_last_team = false;
+  if (omp_thread_num == 0) {
+    team_vals[omp_team_num] = xwave_lds[0];
+    uint32_t td = atomic::inc(teams_done_ptr, NumTeams - 1u, atomic::seq_cst);
+    if (td == (NumTeams - 1u))
+      __is_last_team = true;
+  }
+
+  // This sync is needed so all threads from the last team know they
+  // are in the last team.
+  _OMP::synchronize::threadsAligned();
+
+  if (__is_last_team) {
+    // All threads from the last completed team enter here.
+    // All other teams exit the helper function.
+    // must have more threads than teams
+    if (omp_thread_num < NumTeams)
+      val = team_vals[omp_thread_num];
+    else
+      val = inival;
+
+    _OMP::synchronize::threadsAligned();
+
+    // Reduce each wave into xwave_lds[wave_num]
+    for (unsigned int offset = _WSZ / 2; offset > 0; offset >>= 1)
+      (*_rf)(&val, xteamr_shfl_xor<T, _WSZ>(val, offset));
+    if (lane_num == 0)
+      xwave_lds[wave_num] = val;
+
+    // To get the final result, we know xwave_lds[0] is done,
+    // but we need a sync here to ensure xwave_lds[i!=0] is correct.
+    _OMP::synchronize::threadsAligned();
+
+    // Typically only 2 usable waves when <128 CUs. No gain to parallelizing
+    // these last 2 reductions. So do these on thread 0 into lane 0's val.
+    if (omp_thread_num == 0) {
+      unsigned int usableWaves = ((NumTeams - 1) / _WSZ) + 1;
+      // Reduce with the original result value.
+      xwave_lds[usableWaves] = *r_ptr;
+      for (unsigned int kk = 1; kk <= usableWaves; kk++)
+        (*_rf_lds)(&xwave_lds[0], &xwave_lds[kk]);
+
+      *r_ptr = xwave_lds[0];
+    }
+
+    // This sync is needed to prevent warps in the last team from starting
+    // if there was another reduction.
+    _OMP::synchronize::threadsAligned();
+  }
+}
+
+// Calls to these __kmpc extern C functions are created in clang codegen
+// for FORTRAN, C, and C++. They may also be used for simulation and testing.
+// The headers for these extern C functions are in ../include/Xteamr.h,
+// which is included by Interface.h. The compiler builds the name based on
+// the data type, the number of waves in the team, and the warp size.
+//
+#define _EXT_ATTR extern "C" __attribute__((flatten, always_inline)) void
+_EXT_ATTR
+__kmpc_xteamr_d_16x64(double v, double *r_ptr, double *tvals, uint32_t *td_ptr,
+                      void (*_rf)(double *, double),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS double *,
+                                      __XTEAM_SHARED_LDS double *),
+                      const double iv) {
+  _xteam_reduction<double, 16, 64>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_f_16x64(float v, float *r_ptr, float *tvals, uint32_t *td_ptr,
+                      void (*_rf)(float *, float),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS float *,
+                                      __XTEAM_SHARED_LDS float *),
+                      const float iv) {
+  _xteam_reduction<float, 16, 64>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+
+_EXT_ATTR
+__kmpc_xteamr_cd_16x64(double _Complex v, double _Complex *r_ptr,
+                       double _Complex *tvals, uint32_t *td_ptr,
+                       void (*_rf)(double _Complex *, double _Complex),
+                       void (*_rf_lds)(__XTEAM_SHARED_LDS double _Complex *,
+                                       __XTEAM_SHARED_LDS double _Complex *),
+                       double _Complex iv) {
+  _xteam_reduction<double _Complex, 16, 64>(v, r_ptr, tvals, td_ptr, _rf,
+                                            _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_cf_16x64(float _Complex v, float _Complex *r_ptr,
+                       float _Complex *tvals, uint32_t *td_ptr,
+                       void (*_rf)(float _Complex *, float _Complex),
+                       void (*_rf_lds)(__XTEAM_SHARED_LDS float _Complex *,
+                                       __XTEAM_SHARED_LDS float _Complex *),
+                       float _Complex iv) {
+  _xteam_reduction<float _Complex, 16, 64>(v, r_ptr, tvals, td_ptr, _rf,
+                                           _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_i_16x64(int v, int *r_ptr, int *tvals, uint32_t *td_ptr,
+                      void (*_rf)(int *, int),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS int *,
+                                      __XTEAM_SHARED_LDS int *),
+                      const int iv) {
+  _xteam_reduction<int, 16, 64>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_ui_16x64(uint32_t v, uint32_t *r_ptr, uint32_t *tvals,
+                       uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t),
+                       void (*_rf_lds)(__XTEAM_SHARED_LDS uint32_t *,
+                                       __XTEAM_SHARED_LDS uint32_t *),
+                       const uint32_t iv) {
+  _xteam_reduction<uint32_t, 16, 64>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds,
+                                     iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_l_16x64(long v, long *r_ptr, long *tvals, uint32_t *td_ptr,
+                      void (*_rf)(long *, long),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS long *,
+                                      __XTEAM_SHARED_LDS long *),
+                      const long iv) {
+  _xteam_reduction<long, 16, 64>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_ul_16x64(uint64_t v, uint64_t *r_ptr, uint64_t *tvals,
+                       uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t),
+                       void (*_rf_lds)(__XTEAM_SHARED_LDS uint64_t *,
+                                       __XTEAM_SHARED_LDS uint64_t *),
+                       const uint64_t iv) {
+  _xteam_reduction<uint64_t, 16, 64>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds,
+                                     iv);
+}
+
+_EXT_ATTR
+__kmpc_xteamr_d_8x64(double v, double *r_ptr, double *tvals, uint32_t *td_ptr,
+                     void (*_rf)(double *, double),
+                     void (*_rf_lds)(__XTEAM_SHARED_LDS double *,
+                                     __XTEAM_SHARED_LDS double *),
+                     const double iv) {
+  _xteam_reduction<double, 8, 64>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_f_8x64(float v, float *r_ptr, float *tvals, uint32_t *td_ptr,
+                     void (*_rf)(float *, float),
+                     void (*_rf_lds)(__XTEAM_SHARED_LDS float *,
+                                     __XTEAM_SHARED_LDS float *),
+                     const float iv) {
+  _xteam_reduction<float, 8, 64>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_cd_8x64(double _Complex v, double _Complex *r_ptr,
+                      double _Complex *tvals, uint32_t *td_ptr,
+                      void (*_rf)(double _Complex *, double _Complex),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS double _Complex *,
+                                      __XTEAM_SHARED_LDS double _Complex *),
+                      const double _Complex iv) {
+  _xteam_reduction<double _Complex, 8, 64>(v, r_ptr, tvals, td_ptr, _rf,
+                                           _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_cf_8x64(float _Complex v, float _Complex *r_ptr,
+                      float _Complex *tvals, uint32_t *td_ptr,
+                      void (*_rf)(float _Complex *, float _Complex),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS float _Complex *,
+                                      __XTEAM_SHARED_LDS float _Complex *),
+                      const float _Complex iv) {
+  _xteam_reduction<float _Complex, 8, 64>(v, r_ptr, tvals, td_ptr, _rf,
+                                          _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_i_8x64(int v, int *r_ptr, int *tvals, uint32_t *td_ptr,
+                     void (*_rf)(int *, int),
+                     void (*_rf_lds)(__XTEAM_SHARED_LDS int *,
+                                     __XTEAM_SHARED_LDS int *),
+                     const int iv) {
+  _xteam_reduction<int, 8, 64>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_ui_8x64(uint32_t v, uint32_t *r_ptr, uint32_t *tvals,
+                      uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS uint32_t *,
+                                      __XTEAM_SHARED_LDS uint32_t *),
+                      const uint32_t iv) {
+  _xteam_reduction<uint32_t, 8, 64>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_l_8x64(long v, long *r_ptr, long *tvals, uint32_t *td_ptr,
+                     void (*_rf)(long *, long),
+                     void (*_rf_lds)(__XTEAM_SHARED_LDS long *,
+                                     __XTEAM_SHARED_LDS long *),
+                     const long iv) {
+  _xteam_reduction<long, 8, 64>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_ul_8x64(uint64_t v, uint64_t *r_ptr, uint64_t *tvals,
+                      uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS uint64_t *,
+                                      __XTEAM_SHARED_LDS uint64_t *),
+                      const uint64_t iv) {
+  _xteam_reduction<uint64_t, 8, 64>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+
+_EXT_ATTR
+__kmpc_xteamr_d_4x64(double v, double *r_ptr, double *tvals, uint32_t *td_ptr,
+                     void (*_rf)(double *, double),
+                     void (*_rf_lds)(__XTEAM_SHARED_LDS double *,
+                                     __XTEAM_SHARED_LDS double *),
+                     const double iv) {
+  _xteam_reduction<double, 4, 64>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_f_4x64(float v, float *r_ptr, float *tvals, uint32_t *td_ptr,
+                     void (*_rf)(float *, float),
+                     void (*_rf_lds)(__XTEAM_SHARED_LDS float *,
+                                     __XTEAM_SHARED_LDS float *),
+                     const float iv) {
+  _xteam_reduction<float, 4, 64>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_cd_4x64(double _Complex v, double _Complex *r_ptr,
+                      double _Complex *tvals, uint32_t *td_ptr,
+                      void (*_rf)(double _Complex *, double _Complex),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS double _Complex *,
+                                      __XTEAM_SHARED_LDS double _Complex *),
+                      const double _Complex iv) {
+  _xteam_reduction<double _Complex, 4, 64>(v, r_ptr, tvals, td_ptr, _rf,
+                                           _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_cf_4x64(float _Complex v, float _Complex *r_ptr,
+                      float _Complex *tvals, uint32_t *td_ptr,
+                      void (*_rf)(float _Complex *, float _Complex),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS float _Complex *,
+                                      __XTEAM_SHARED_LDS float _Complex *),
+                      const float _Complex iv) {
+  _xteam_reduction<float _Complex, 4, 64>(v, r_ptr, tvals, td_ptr, _rf,
+                                          _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_i_4x64(int v, int *r_ptr, int *tvals, uint32_t *td_ptr,
+                     void (*_rf)(int *, int),
+                     void (*_rf_lds)(__XTEAM_SHARED_LDS int *,
+                                     __XTEAM_SHARED_LDS int *),
+                     const int iv) {
+  _xteam_reduction<int, 4, 64>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_ui_4x64(uint32_t v, uint32_t *r_ptr, uint32_t *tvals,
+                      uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS uint32_t *,
+                                      __XTEAM_SHARED_LDS uint32_t *),
+                      const uint32_t iv) {
+  _xteam_reduction<uint32_t, 4, 64>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_l_4x64(long v, long *r_ptr, long *tvals, uint32_t *td_ptr,
+                     void (*_rf)(long *, long),
+                     void (*_rf_lds)(__XTEAM_SHARED_LDS long *,
+                                     __XTEAM_SHARED_LDS long *),
+                     const long iv) {
+  _xteam_reduction<long, 4, 64>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_ul_4x64(uint64_t v, uint64_t *r_ptr, uint64_t *tvals,
+                      uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS uint64_t *,
+                                      __XTEAM_SHARED_LDS uint64_t *),
+                      const uint64_t iv) {
+  _xteam_reduction<uint64_t, 4, 64>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+
+_EXT_ATTR
+__kmpc_xteamr_d_32x32(double v, double *r_ptr, double *tvals, uint32_t *td_ptr,
+                      void (*_rf)(double *, double),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS double *,
+                                      __XTEAM_SHARED_LDS double *),
+                      const double iv) {
+  _xteam_reduction<double, 32, 32>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_f_32x32(float v, float *r_ptr, float *tvals, uint32_t *td_ptr,
+                      void (*_rf)(float *, float),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS float *,
+                                      __XTEAM_SHARED_LDS float *),
+                      const float iv) {
+  _xteam_reduction<float, 32, 32>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_cd_32x32(double _Complex v, double _Complex *r_ptr,
+                       double _Complex *tvals, uint32_t *td_ptr,
+                       void (*_rf)(double _Complex *, double _Complex),
+                       void (*_rf_lds)(__XTEAM_SHARED_LDS double _Complex *,
+                                       __XTEAM_SHARED_LDS double _Complex *),
+                       const double _Complex iv) {
+  _xteam_reduction<double _Complex, 32, 32>(v, r_ptr, tvals, td_ptr, _rf,
+                                            _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_cf_32x32(float _Complex v, float _Complex *r_ptr,
+                       float _Complex *tvals, uint32_t *td_ptr,
+                       void (*_rf)(float _Complex *, float _Complex),
+                       void (*_rf_lds)(__XTEAM_SHARED_LDS float _Complex *,
+                                       __XTEAM_SHARED_LDS float _Complex *),
+                       const float _Complex iv) {
+  _xteam_reduction<float _Complex, 32, 32>(v, r_ptr, tvals, td_ptr, _rf,
+                                           _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_i_32x32(int v, int *r_ptr, int *tvals, uint32_t *td_ptr,
+                      void (*_rf)(int *, int),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS int *,
+                                      __XTEAM_SHARED_LDS int *),
+                      const int iv) {
+  _xteam_reduction<int, 32, 32>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_ui_32x32(uint32_t v, uint32_t *r_ptr, uint32_t *tvals,
+                       uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t),
+                       void (*_rf_lds)(__XTEAM_SHARED_LDS uint32_t *,
+                                       __XTEAM_SHARED_LDS uint32_t *),
+                       const uint32_t iv) {
+  _xteam_reduction<uint32_t, 32, 32>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds,
+                                     iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_l_32x32(long v, long *r_ptr, long *tvals, uint32_t *td_ptr,
+                      void (*_rf)(long *, long),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS long *,
+                                      __XTEAM_SHARED_LDS long *),
+                      const long iv) {
+  _xteam_reduction<long, 32, 32>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_ul_32x32(uint64_t v, uint64_t *r_ptr, uint64_t *tvals,
+                       uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t),
+                       void (*_rf_lds)(__XTEAM_SHARED_LDS uint64_t *,
+                                       __XTEAM_SHARED_LDS uint64_t *),
+                       const uint64_t iv) {
+  _xteam_reduction<uint64_t, 32, 32>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds,
+                                     iv);
+}
+
+_EXT_ATTR
+__kmpc_xteamr_d_16x32(double v, double *r_ptr, double *tvals, uint32_t *td_ptr,
+                      void (*_rf)(double *, double),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS double *,
+                                      __XTEAM_SHARED_LDS double *),
+                      const double iv) {
+  _xteam_reduction<double, 16, 32>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_f_16x32(float v, float *r_ptr, float *tvals, uint32_t *td_ptr,
+                      void (*_rf)(float *, float),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS float *,
+                                      __XTEAM_SHARED_LDS float *),
+                      const float iv) {
+  _xteam_reduction<float, 16, 32>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_cd_16x32(double _Complex v, double _Complex *r_ptr,
+                       double _Complex *tvals, uint32_t *td_ptr,
+                       void (*_rf)(double _Complex *, double _Complex),
+                       void (*_rf_lds)(__XTEAM_SHARED_LDS double _Complex *,
+                                       __XTEAM_SHARED_LDS double _Complex *),
+                       const double _Complex iv) {
+  _xteam_reduction<double _Complex, 16, 32>(v, r_ptr, tvals, td_ptr, _rf,
+                                            _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_cf_16x32(float _Complex v, float _Complex *r_ptr,
+                       float _Complex *tvals, uint32_t *td_ptr,
+                       void (*_rf)(float _Complex *, float _Complex),
+                       void (*_rf_lds)(__XTEAM_SHARED_LDS float _Complex *,
+                                       __XTEAM_SHARED_LDS float _Complex *),
+                       const float _Complex iv) {
+  _xteam_reduction<float _Complex, 16, 32>(v, r_ptr, tvals, td_ptr, _rf,
+                                           _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_i_16x32(int v, int *r_ptr, int *tvals, uint32_t *td_ptr,
+                      void (*_rf)(int *, int),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS int *,
+                                      __XTEAM_SHARED_LDS int *),
+                      const int iv) {
+  _xteam_reduction<int, 16, 32>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_ui_16x32(uint32_t v, uint32_t *r_ptr, uint32_t *tvals,
+                       uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t),
+                       void (*_rf_lds)(__XTEAM_SHARED_LDS uint32_t *,
+                                       __XTEAM_SHARED_LDS uint32_t *),
+                       const uint32_t iv) {
+  _xteam_reduction<uint32_t, 16, 32>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds,
+                                     iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_l_16x32(long v, long *r_ptr, long *tvals, uint32_t *td_ptr,
+                      void (*_rf)(long *, long),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS long *,
+                                      __XTEAM_SHARED_LDS long *),
+                      const long iv) {
+  _xteam_reduction<long, 16, 32>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_ul_16x32(uint64_t v, uint64_t *r_ptr, uint64_t *tvals,
+                       uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t),
+                       void (*_rf_lds)(__XTEAM_SHARED_LDS uint64_t *,
+                                       __XTEAM_SHARED_LDS uint64_t *),
+                       const uint64_t iv) {
+  _xteam_reduction<uint64_t, 16, 32>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds,
+                                     iv);
+}
+
+_EXT_ATTR
+__kmpc_xteamr_d_8x32(double v, double *r_ptr, double *tvals, uint32_t *td_ptr,
+                     void (*_rf)(double *, double),
+                     void (*_rf_lds)(__XTEAM_SHARED_LDS double *,
+                                     __XTEAM_SHARED_LDS double *),
+                     const double iv) {
+  _xteam_reduction<double, 8, 32>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_f_8x32(float v, float *r_ptr, float *tvals, uint32_t *td_ptr,
+                     void (*_rf)(float *, float),
+                     void (*_rf_lds)(__XTEAM_SHARED_LDS float *,
+                                     __XTEAM_SHARED_LDS float *),
+                     const float iv) {
+  _xteam_reduction<float, 8, 32>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_cd_8x32(double _Complex v, double _Complex *r_ptr,
+                      double _Complex *tvals, uint32_t *td_ptr,
+                      void (*_rf)(double _Complex *, double _Complex),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS double _Complex *,
+                                      __XTEAM_SHARED_LDS double _Complex *),
+                      const double _Complex iv) {
+  _xteam_reduction<double _Complex, 8, 32>(v, r_ptr, tvals, td_ptr, _rf,
+                                           _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_cf_8x32(float _Complex v, float _Complex *r_ptr,
+                      float _Complex *tvals, uint32_t *td_ptr,
+                      void (*_rf)(float _Complex *, float _Complex),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS float _Complex *,
+                                      __XTEAM_SHARED_LDS float _Complex *),
+                      const float _Complex iv) {
+  _xteam_reduction<float _Complex, 8, 32>(v, r_ptr, tvals, td_ptr, _rf,
+                                          _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_i_8x32(int v, int *r_ptr, int *tvals, uint32_t *td_ptr,
+                     void (*_rf)(int *, int),
+                     void (*_rf_lds)(__XTEAM_SHARED_LDS int *,
+                                     __XTEAM_SHARED_LDS int *),
+                     const int iv) {
+  _xteam_reduction<int, 8, 32>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_ui_8x32(uint32_t v, uint32_t *r_ptr, uint32_t *tvals,
+                      uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS uint32_t *,
+                                      __XTEAM_SHARED_LDS uint32_t *),
+                      const uint32_t iv) {
+  _xteam_reduction<uint32_t, 8, 32>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_l_8x32(long v, long *r_ptr, long *tvals, uint32_t *td_ptr,
+                     void (*_rf)(long *, long),
+                     void (*_rf_lds)(__XTEAM_SHARED_LDS long *,
+                                     __XTEAM_SHARED_LDS long *),
+                     const long iv) {
+  _xteam_reduction<long, 8, 32>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_ul_8x32(uint64_t v, uint64_t *r_ptr, uint64_t *tvals,
+                      uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS uint64_t *,
+                                      __XTEAM_SHARED_LDS uint64_t *),
+                      const uint64_t iv) {
+  _xteam_reduction<uint64_t, 8, 32>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+
+// Built-in pair reduction functions used as function pointers for
+// cross team reduction functions.
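+// Each pair function folds its second argument into the value addressed by
+// its first argument (e.g. *val += otherval); the _lds variants do the same
+// for two values held in team-shared (LDS) memory. For example, a double sum
+// reduction passes __kmpc_rfun_sum_d as _rf and __kmpc_rfun_sum_lds_d as
+// _rf_lds, with 0.0 as the reduction null value.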
+ +#define _RF_LDS volatile __attribute__((address_space(3))) + +_EXT_ATTR __kmpc_rfun_sum_d(double *val, double otherval) { *val += otherval; } +_EXT_ATTR __kmpc_rfun_sum_lds_d(_RF_LDS double *val, _RF_LDS double *otherval) { + *val += *otherval; +} +_EXT_ATTR __kmpc_rfun_sum_f(float *val, float otherval) { *val += otherval; } +_EXT_ATTR __kmpc_rfun_sum_lds_f(_RF_LDS float *val, _RF_LDS float *otherval) { + *val += *otherval; +} +_EXT_ATTR __kmpc_rfun_sum_cd(double _Complex *val, double _Complex otherval) { + *val += otherval; +} +_EXT_ATTR __kmpc_rfun_sum_lds_cd(_RF_LDS double _Complex *val, + _RF_LDS double _Complex *otherval) { + *val += *otherval; +} +_EXT_ATTR __kmpc_rfun_sum_cf(float _Complex *val, float _Complex otherval) { + *val += otherval; +} +_EXT_ATTR __kmpc_rfun_sum_lds_cf(_RF_LDS float _Complex *val, + _RF_LDS float _Complex *otherval) { + *val += *otherval; +} +_EXT_ATTR __kmpc_rfun_sum_i(int *val, int otherval) { *val += otherval; } +_EXT_ATTR __kmpc_rfun_sum_lds_i(_RF_LDS int *val, _RF_LDS int *otherval) { + *val += *otherval; +} +_EXT_ATTR __kmpc_rfun_sum_ui(unsigned int *val, unsigned int otherval) { + *val += otherval; +} +_EXT_ATTR __kmpc_rfun_sum_lds_ui(_RF_LDS unsigned int *val, + _RF_LDS unsigned int *otherval) { + *val += *otherval; +} +_EXT_ATTR __kmpc_rfun_sum_l(long *val, long otherval) { *val += otherval; } +_EXT_ATTR __kmpc_rfun_sum_lds_l(_RF_LDS long *val, _RF_LDS long *otherval) { + *val += *otherval; +} +_EXT_ATTR __kmpc_rfun_sum_ul(unsigned long *val, unsigned long otherval) { + *val += otherval; +} +_EXT_ATTR __kmpc_rfun_sum_lds_ul(_RF_LDS unsigned long *val, + _RF_LDS unsigned long *otherval) { + *val += *otherval; +} + +_EXT_ATTR __kmpc_rfun_min_d(double *val, double otherval) { + *val = (otherval < *val) ? otherval : *val; +} +_EXT_ATTR __kmpc_rfun_min_lds_d(_RF_LDS double *val, _RF_LDS double *otherval) { + *val = (*otherval < *val) ? *otherval : *val; +} +_EXT_ATTR __kmpc_rfun_min_f(float *val, float otherval) { + *val = (otherval < *val) ? otherval : *val; +} +_EXT_ATTR __kmpc_rfun_min_lds_f(_RF_LDS float *val, _RF_LDS float *otherval) { + *val = (*otherval < *val) ? *otherval : *val; +} +_EXT_ATTR __kmpc_rfun_min_i(int *val, int otherval) { + *val = (otherval < *val) ? otherval : *val; +} +_EXT_ATTR __kmpc_rfun_min_lds_i(_RF_LDS int *val, _RF_LDS int *otherval) { + *val = (*otherval < *val) ? *otherval : *val; +} +_EXT_ATTR __kmpc_rfun_min_ui(unsigned int *val, unsigned int otherval) { + *val = (otherval < *val) ? otherval : *val; +} +_EXT_ATTR __kmpc_rfun_min_lds_ui(_RF_LDS unsigned int *val, + _RF_LDS unsigned int *otherval) { + *val = (*otherval < *val) ? *otherval : *val; +} +_EXT_ATTR __kmpc_rfun_min_l(long *val, long otherval) { + *val = (otherval < *val) ? otherval : *val; +} +_EXT_ATTR __kmpc_rfun_min_lds_l(_RF_LDS long *val, _RF_LDS long *otherval) { + *val = (*otherval < *val) ? *otherval : *val; +} +_EXT_ATTR __kmpc_rfun_min_ul(unsigned long *val, unsigned long otherval) { + *val = (otherval < *val) ? otherval : *val; +} +_EXT_ATTR __kmpc_rfun_min_lds_ul(_RF_LDS unsigned long *val, + _RF_LDS unsigned long *otherval) { + *val = (*otherval < *val) ? *otherval : *val; +} + +_EXT_ATTR __kmpc_rfun_max_d(double *val, double otherval) { + *val = (otherval > *val) ? otherval : *val; +} +_EXT_ATTR __kmpc_rfun_max_lds_d(_RF_LDS double *val, _RF_LDS double *otherval) { + *val = (*otherval > *val) ? *otherval : *val; +} +_EXT_ATTR __kmpc_rfun_max_f(float *val, float otherval) { + *val = (otherval > *val) ? 
otherval : *val;
+}
+_EXT_ATTR __kmpc_rfun_max_lds_f(_RF_LDS float *val, _RF_LDS float *otherval) {
+  *val = (*otherval > *val) ? *otherval : *val;
+}
+_EXT_ATTR __kmpc_rfun_max_i(int *val, int otherval) {
+  *val = (otherval > *val) ? otherval : *val;
+}
+_EXT_ATTR __kmpc_rfun_max_lds_i(_RF_LDS int *val, _RF_LDS int *otherval) {
+  *val = (*otherval > *val) ? *otherval : *val;
+}
+_EXT_ATTR __kmpc_rfun_max_ui(unsigned int *val, unsigned int otherval) {
+  *val = (otherval > *val) ? otherval : *val;
+}
+_EXT_ATTR __kmpc_rfun_max_lds_ui(_RF_LDS unsigned int *val,
+                                 _RF_LDS unsigned int *otherval) {
+  *val = (*otherval > *val) ? *otherval : *val;
+}
+_EXT_ATTR __kmpc_rfun_max_l(long *val, long otherval) {
+  *val = (otherval > *val) ? otherval : *val;
+}
+_EXT_ATTR __kmpc_rfun_max_lds_l(_RF_LDS long *val, _RF_LDS long *otherval) {
+  *val = (*otherval > *val) ? *otherval : *val;
+}
+_EXT_ATTR __kmpc_rfun_max_ul(unsigned long *val, unsigned long otherval) {
+  *val = (otherval > *val) ? otherval : *val;
+}
+_EXT_ATTR __kmpc_rfun_max_lds_ul(_RF_LDS unsigned long *val,
+                                 _RF_LDS unsigned long *otherval) {
+  *val = (*otherval > *val) ? *otherval : *val;
+}
+
+#undef _EXT_ATTR
+#undef _RF_LDS
+
+#pragma omp end declare target
diff --git a/openmp/libomptarget/test/lit.cfg b/openmp/libomptarget/test/lit.cfg
--- a/openmp/libomptarget/test/lit.cfg
+++ b/openmp/libomptarget/test/lit.cfg
@@ -99,6 +99,7 @@
     config.test_flags += " -Wl,-rpath," + config.library_dir
     config.test_flags += " -Wl,-rpath," + config.omp_host_rtl_directory
    config.test_flags += " -Wl,-rpath," + config.llvm_lib_directory
+    config.test_flags += " -latomic"
     if config.cuda_libdir:
         config.test_flags += " -Wl,-rpath," + config.cuda_libdir
 if config.libomptarget_current_target.startswith('amdgcn'):
diff --git a/openmp/libomptarget/test/xteamr/test_xteamr.h b/openmp/libomptarget/test/xteamr/test_xteamr.h
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/test/xteamr/test_xteamr.h
@@ -0,0 +1,892 @@
+
+// Header file: overload_to_externs.h
+// generated by utility gen_externs
+
+#define _CD double _Complex
+#define _CF float _Complex
+#define _UI unsigned int
+#define _UL unsigned long
+#define _INLINE_ATTR_ __attribute__((flatten, always_inline))
+
+// Headers for the extern xteamr functions defined in the libomptarget
+// DeviceRTL are declared here in the test application because user apps
+// cannot include the DeviceRTL Interface.h header file.
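+//
+// Illustrative sketch only (not part of the generated header): a test can
+// drive the 16x64 double sum entry point from a target region roughly as
+// follows, where `result`, the per-team array `team_vals`, and the
+// zero-initialized counter `teams_done` are device-resident names assumed
+// by this sketch:
+//
+//   double val = 0.0; // thread-local partial sum
+//   for (int64_t i = tid; i < N; i += nthreads)
+//     val += a[i];
+//   __kmpc_xteamr_d_16x64(val, &result, team_vals, &teams_done,
+//                         __kmpc_rfun_sum_d, __kmpc_rfun_sum_lds_d, 0.0);
+//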
+ +#if defined(__AMDGCN__) || defined(__NVPTX__) +extern "C" { +#define _RF_LDS volatile __attribute__((address_space(3))) +void _INLINE_ATTR_ __kmpc_xteamr_d_16x64 + (double v, double *r_ptr, double *tvs, uint32_t *td, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double iv); +void _INLINE_ATTR_ __kmpc_xteamr_f_16x64 + (float v, float *r_ptr, float *tvs, uint32_t *td, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float iv); +void _INLINE_ATTR_ __kmpc_xteamr_cd_16x64 + (_CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), + void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD iv); +void _INLINE_ATTR_ __kmpc_xteamr_cf_16x64 + (_CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), + void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF iv); +void _INLINE_ATTR_ __kmpc_xteamr_i_16x64 + (int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int iv); +void _INLINE_ATTR_ __kmpc_xteamr_ui_16x64 + (_UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), + void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI iv); +void _INLINE_ATTR_ __kmpc_xteamr_l_16x64 + (long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long iv); +void _INLINE_ATTR_ __kmpc_xteamr_ul_16x64 + (_UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), + void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL iv); +void _INLINE_ATTR_ __kmpc_xteamr_d_32x32 + (double v, double *r_ptr, double *tvs, uint32_t *td, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double iv); +void _INLINE_ATTR_ __kmpc_xteamr_f_32x32 + (float v, float *r_ptr, float *tvs, uint32_t *td, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float iv); +void _INLINE_ATTR_ __kmpc_xteamr_cd_32x32 + (_CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), + void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD iv); +void _INLINE_ATTR_ __kmpc_xteamr_cf_32x32 + (_CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), + void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF iv); +void _INLINE_ATTR_ __kmpc_xteamr_i_32x32 + (int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int iv); +void _INLINE_ATTR_ __kmpc_xteamr_ui_32x32 + (_UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), + void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI iv); +void _INLINE_ATTR_ __kmpc_xteamr_l_32x32 + (long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long iv); +void _INLINE_ATTR_ __kmpc_xteamr_ul_32x32 + (_UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), + void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL iv); +void _INLINE_ATTR_ __kmpc_xteamr_d_8x64 + (double v, double *r_ptr, double *tvs, uint32_t *td, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double iv); +void _INLINE_ATTR_ __kmpc_xteamr_f_8x64 + (float v, float *r_ptr, float *tvs, uint32_t *td, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float iv); +void _INLINE_ATTR_ __kmpc_xteamr_cd_8x64 + (_CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD 
*, _CD), + void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD iv); +void _INLINE_ATTR_ __kmpc_xteamr_cf_8x64 + (_CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), + void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF iv); +void _INLINE_ATTR_ __kmpc_xteamr_i_8x64 + (int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int iv); +void _INLINE_ATTR_ __kmpc_xteamr_ui_8x64 + (_UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), + void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI iv); +void _INLINE_ATTR_ __kmpc_xteamr_l_8x64 + (long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long iv); +void _INLINE_ATTR_ __kmpc_xteamr_ul_8x64 + (_UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), + void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL iv); +void _INLINE_ATTR_ __kmpc_xteamr_d_16x32 + (double v, double *r_ptr, double *tvs, uint32_t *td, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double iv); +void _INLINE_ATTR_ __kmpc_xteamr_f_16x32 + (float v, float *r_ptr, float *tvs, uint32_t *td, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float iv); +void _INLINE_ATTR_ __kmpc_xteamr_cd_16x32 + (_CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), + void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD iv); +void _INLINE_ATTR_ __kmpc_xteamr_cf_16x32 + (_CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), + void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF iv); +void _INLINE_ATTR_ __kmpc_xteamr_i_16x32 + (int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int iv); +void _INLINE_ATTR_ __kmpc_xteamr_ui_16x32 + (_UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), + void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI iv); +void _INLINE_ATTR_ __kmpc_xteamr_l_16x32 + (long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long iv); +void _INLINE_ATTR_ __kmpc_xteamr_ul_16x32 + (_UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), + void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL iv); +void _INLINE_ATTR_ __kmpc_xteamr_d_4x64 + (double v, double *r_ptr, double *tvs, uint32_t *td, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double iv); +void _INLINE_ATTR_ __kmpc_xteamr_f_4x64 + (float v, float *r_ptr, float *tvs, uint32_t *td, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float iv); +void _INLINE_ATTR_ __kmpc_xteamr_cd_4x64 + (_CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), + void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD iv); +void _INLINE_ATTR_ __kmpc_xteamr_cf_4x64 + (_CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), + void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF iv); +void _INLINE_ATTR_ __kmpc_xteamr_i_4x64 + (int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int iv); +void _INLINE_ATTR_ __kmpc_xteamr_ui_4x64 + (_UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), + void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI iv); +void _INLINE_ATTR_ 
__kmpc_xteamr_l_4x64 + (long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long iv); +void _INLINE_ATTR_ __kmpc_xteamr_ul_4x64 + (_UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), + void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL iv); +void _INLINE_ATTR_ __kmpc_xteamr_d_8x32 + (double v, double *r_ptr, double *tvs, uint32_t *td, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double iv); +void _INLINE_ATTR_ __kmpc_xteamr_f_8x32 + (float v, float *r_ptr, float *tvs, uint32_t *td, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float iv); +void _INLINE_ATTR_ __kmpc_xteamr_cd_8x32 + (_CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), + void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD iv); +void _INLINE_ATTR_ __kmpc_xteamr_cf_8x32 + (_CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), + void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF iv); +void _INLINE_ATTR_ __kmpc_xteamr_i_8x32 + (int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int iv); +void _INLINE_ATTR_ __kmpc_xteamr_ui_8x32 + (_UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), + void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI iv); +void _INLINE_ATTR_ __kmpc_xteamr_l_8x32 + (long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long iv); +void _INLINE_ATTR_ __kmpc_xteamr_ul_8x32 + (_UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), + void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL iv); +void __kmpc_rfun_sum_d(double *val, double otherval); +void __kmpc_rfun_sum_lds_d(_RF_LDS double *val, _RF_LDS double *otherval); +void __kmpc_rfun_sum_f(float *val, float otherval); +void __kmpc_rfun_sum_lds_f(_RF_LDS float *val, _RF_LDS float *otherval); +void __kmpc_rfun_sum_cd(_CD *val, _CD otherval); +void __kmpc_rfun_sum_lds_cd(_RF_LDS _CD *val, _RF_LDS _CD *otherval); +void __kmpc_rfun_sum_cf(_CF *val, _CF otherval); +void __kmpc_rfun_sum_lds_cf(_RF_LDS _CF *val, _RF_LDS _CF *otherval); +void __kmpc_rfun_sum_i(int *val, int otherval); +void __kmpc_rfun_sum_lds_i(_RF_LDS int *val, _RF_LDS int *otherval); +void __kmpc_rfun_sum_ui(_UI *val, _UI otherval); +void __kmpc_rfun_sum_lds_ui(_RF_LDS _UI *val, _RF_LDS _UI *otherval); +void __kmpc_rfun_sum_l(long *val, long otherval); +void __kmpc_rfun_sum_lds_l(_RF_LDS long *val, _RF_LDS long *otherval); +void __kmpc_rfun_sum_ul(_UL *val, _UL otherval); +void __kmpc_rfun_sum_lds_ul(_RF_LDS _UL *val, _RF_LDS _UL *otherval); +void __kmpc_rfun_max_d(double *val, double otherval); +void __kmpc_rfun_max_lds_d(_RF_LDS double *val, _RF_LDS double *otherval); +void __kmpc_rfun_max_f(float *val, float otherval); +void __kmpc_rfun_max_lds_f(_RF_LDS float *val, _RF_LDS float *otherval); +void __kmpc_rfun_max_i(int *val, int otherval); +void __kmpc_rfun_max_lds_i(_RF_LDS int *val, _RF_LDS int *otherval); +void __kmpc_rfun_max_ui(_UI *val, _UI otherval); +void __kmpc_rfun_max_lds_ui(_RF_LDS _UI *val, _RF_LDS _UI *otherval); +void __kmpc_rfun_max_l(long *val, long otherval); +void __kmpc_rfun_max_lds_l(_RF_LDS long *val, _RF_LDS long *otherval); +void __kmpc_rfun_max_ul(_UL *val, _UL otherval); +void __kmpc_rfun_max_lds_ul(_RF_LDS _UL *val, _RF_LDS _UL *otherval); +void 
__kmpc_rfun_min_d(double *val, double otherval); +void __kmpc_rfun_min_lds_d(_RF_LDS double *val, _RF_LDS double *otherval); +void __kmpc_rfun_min_f(float *val, float otherval); +void __kmpc_rfun_min_lds_f(_RF_LDS float *val, _RF_LDS float *otherval); +void __kmpc_rfun_min_i(int *val, int otherval); +void __kmpc_rfun_min_lds_i(_RF_LDS int *val, _RF_LDS int *otherval); +void __kmpc_rfun_min_ui(_UI *val, _UI otherval); +void __kmpc_rfun_min_lds_ui(_RF_LDS _UI *val, _RF_LDS _UI *otherval); +void __kmpc_rfun_min_l(long *val, long otherval); +void __kmpc_rfun_min_lds_l(_RF_LDS long *val, _RF_LDS long *otherval); +void __kmpc_rfun_min_ul(_UL *val, _UL otherval); +void __kmpc_rfun_min_lds_ul(_RF_LDS _UL *val, _RF_LDS _UL *otherval); +#undef _RF_LDS +int __kmpc_get_warp_size(); +} // end extern C + +#else + +// For host compilation, define null functions for host linking. + +extern "C" { +#undef _RF_LDS +#define _RF_LDS +void __kmpc_xteamr_d_16x64 + (double v, double *r_ptr, double *tvs, uint32_t *td, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double iv){} +void __kmpc_xteamr_f_16x64 + (float v, float *r_ptr, float *tvs, uint32_t *td, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float iv){} +void __kmpc_xteamr_cd_16x64 + (_CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), + void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD iv){} +void __kmpc_xteamr_cf_16x64 + (_CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), + void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF iv){} +void __kmpc_xteamr_i_16x64 + (int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int iv){} +void __kmpc_xteamr_ui_16x64 + (_UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), + void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI iv){} +void __kmpc_xteamr_l_16x64 + (long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long iv){} +void __kmpc_xteamr_ul_16x64 + (_UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), + void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL iv){} +void __kmpc_xteamr_d_32x32 + (double v, double *r_ptr, double *tvs, uint32_t *td, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double iv){} +void __kmpc_xteamr_f_32x32 + (float v, float *r_ptr, float *tvs, uint32_t *td, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float iv){} +void __kmpc_xteamr_cd_32x32 + (_CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), + void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD iv){} +void __kmpc_xteamr_cf_32x32 + (_CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), + void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF iv){} +void __kmpc_xteamr_i_32x32 + (int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int iv){} +void __kmpc_xteamr_ui_32x32 + (_UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), + void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI iv){} +void __kmpc_xteamr_l_32x32 + (long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long iv){} +void __kmpc_xteamr_ul_32x32 + (_UL v, _UL *r_ptr, _UL *tvs, 
uint32_t *td, void (*_rf)(_UL *, _UL), + void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL iv){} +void __kmpc_xteamr_d_8x64 + (double v, double *r_ptr, double *tvs, uint32_t *td, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double iv){} +void __kmpc_xteamr_f_8x64 + (float v, float *r_ptr, float *tvs, uint32_t *td, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float iv){} +void __kmpc_xteamr_cd_8x64 + (_CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), + void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD iv){} +void __kmpc_xteamr_cf_8x64 + (_CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), + void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF iv){} +void __kmpc_xteamr_i_8x64 + (int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int iv){} +void __kmpc_xteamr_ui_8x64 + (_UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), + void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI iv){} +void __kmpc_xteamr_l_8x64 + (long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long iv){} +void __kmpc_xteamr_ul_8x64 + (_UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), + void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL iv){} +void __kmpc_xteamr_d_16x32 + (double v, double *r_ptr, double *tvs, uint32_t *td, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double iv){} +void __kmpc_xteamr_f_16x32 + (float v, float *r_ptr, float *tvs, uint32_t *td, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float iv){} +void __kmpc_xteamr_cd_16x32 + (_CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), + void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD iv){} +void __kmpc_xteamr_cf_16x32 + (_CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), + void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF iv){} +void __kmpc_xteamr_i_16x32 + (int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int iv){} +void __kmpc_xteamr_ui_16x32 + (_UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), + void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI iv){} +void __kmpc_xteamr_l_16x32 + (long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long iv){} +void __kmpc_xteamr_ul_16x32 + (_UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), + void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL iv){} +void __kmpc_xteamr_d_4x64 + (double v, double *r_ptr, double *tvs, uint32_t *td, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double iv){} +void __kmpc_xteamr_f_4x64 + (float v, float *r_ptr, float *tvs, uint32_t *td, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float iv){} +void __kmpc_xteamr_cd_4x64 + (_CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), + void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD iv){} +void __kmpc_xteamr_cf_4x64 + (_CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), + void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF iv){} +void __kmpc_xteamr_i_4x64 + (int v, 
int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int iv){} +void __kmpc_xteamr_ui_4x64 + (_UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), + void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI iv){} +void __kmpc_xteamr_l_4x64 + (long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long iv){} +void __kmpc_xteamr_ul_4x64 + (_UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), + void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL iv){} +void __kmpc_xteamr_d_8x32 + (double v, double *r_ptr, double *tvs, uint32_t *td, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double iv){} +void __kmpc_xteamr_f_8x32 + (float v, float *r_ptr, float *tvs, uint32_t *td, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float iv){} +void __kmpc_xteamr_cd_8x32 + (_CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), + void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD iv){} +void __kmpc_xteamr_cf_8x32 + (_CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), + void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF iv){} +void __kmpc_xteamr_i_8x32 + (int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int iv){} +void __kmpc_xteamr_ui_8x32 + (_UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), + void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI iv){} +void __kmpc_xteamr_l_8x32 + (long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long iv){} +void __kmpc_xteamr_ul_8x32 + (_UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), + void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL iv){} +void __kmpc_rfun_sum_d(double *val, double otherval){} +void __kmpc_rfun_sum_lds_d(_RF_LDS double *val, _RF_LDS double *otherval){} +void __kmpc_rfun_sum_f(float *val, float otherval){} +void __kmpc_rfun_sum_lds_f(_RF_LDS float *val, _RF_LDS float *otherval){} +void __kmpc_rfun_sum_cd(_CD *val, _CD otherval){} +void __kmpc_rfun_sum_lds_cd(_RF_LDS _CD *val, _RF_LDS _CD *otherval){} +void __kmpc_rfun_sum_cf(_CF *val, _CF otherval){} +void __kmpc_rfun_sum_lds_cf(_RF_LDS _CF *val, _RF_LDS _CF *otherval){} +void __kmpc_rfun_sum_i(int *val, int otherval){} +void __kmpc_rfun_sum_lds_i(_RF_LDS int *val, _RF_LDS int *otherval){} +void __kmpc_rfun_sum_ui(_UI *val, _UI otherval){} +void __kmpc_rfun_sum_lds_ui(_RF_LDS _UI *val, _RF_LDS _UI *otherval){} +void __kmpc_rfun_sum_l(long *val, long otherval){} +void __kmpc_rfun_sum_lds_l(_RF_LDS long *val, _RF_LDS long *otherval){} +void __kmpc_rfun_sum_ul(_UL *val, _UL otherval){} +void __kmpc_rfun_sum_lds_ul(_RF_LDS _UL *val, _RF_LDS _UL *otherval){} +void __kmpc_rfun_max_d(double *val, double otherval){} +void __kmpc_rfun_max_lds_d(_RF_LDS double *val, _RF_LDS double *otherval){} +void __kmpc_rfun_max_f(float *val, float otherval){} +void __kmpc_rfun_max_lds_f(_RF_LDS float *val, _RF_LDS float *otherval){} +void __kmpc_rfun_max_i(int *val, int otherval){} +void __kmpc_rfun_max_lds_i(_RF_LDS int *val, _RF_LDS int *otherval){} +void __kmpc_rfun_max_ui(_UI *val, _UI otherval){} +void __kmpc_rfun_max_lds_ui(_RF_LDS _UI *val, _RF_LDS _UI *otherval){} +void __kmpc_rfun_max_l(long *val, long otherval){} +void 
__kmpc_rfun_max_lds_l(_RF_LDS long *val, _RF_LDS long *otherval){}
+void __kmpc_rfun_max_ul(_UL *val, _UL otherval){}
+void __kmpc_rfun_max_lds_ul(_RF_LDS _UL *val, _RF_LDS _UL *otherval){}
+void __kmpc_rfun_min_d(double *val, double otherval){}
+void __kmpc_rfun_min_lds_d(_RF_LDS double *val, _RF_LDS double *otherval){}
+void __kmpc_rfun_min_f(float *val, float otherval){}
+void __kmpc_rfun_min_lds_f(_RF_LDS float *val, _RF_LDS float *otherval){}
+void __kmpc_rfun_min_i(int *val, int otherval){}
+void __kmpc_rfun_min_lds_i(_RF_LDS int *val, _RF_LDS int *otherval){}
+void __kmpc_rfun_min_ui(_UI *val, _UI otherval){}
+void __kmpc_rfun_min_lds_ui(_RF_LDS _UI *val, _RF_LDS _UI *otherval){}
+void __kmpc_rfun_min_l(long *val, long otherval){}
+void __kmpc_rfun_min_lds_l(_RF_LDS long *val, _RF_LDS long *otherval){}
+void __kmpc_rfun_min_ul(_UL *val, _UL otherval){}
+void __kmpc_rfun_min_lds_ul(_RF_LDS _UL *val, _RF_LDS _UL *otherval){}
+#undef _RF_LDS
+int __kmpc_get_warp_size(){
+  printf("ERROR: executing __kmpc_get_warp_size on host\n");
+  return -1;}
+} // end extern C
+
+#endif // of definitions for host null functions
+
+// These overloaded function definitions allow this test framework
+// (test_xteamr.cpp) to invoke the extern DeviceRTL helper functions.
+
+void _INLINE_ATTR_ _overload_to_extern_sum_16x64
+ (double val, double *rv, double *tvs, uint32_t *td, const double iv)
+ { __kmpc_xteamr_d_16x64(val, rv, tvs, td,
+   __kmpc_rfun_sum_d, __kmpc_rfun_sum_lds_d, iv);}
+void _INLINE_ATTR_ _overload_to_extern_sum_16x64
+ (float val, float *rv, float *tvs, uint32_t *td, const float iv)
+ { __kmpc_xteamr_f_16x64(val, rv, tvs, td,
+   __kmpc_rfun_sum_f, __kmpc_rfun_sum_lds_f, iv);}
+void _INLINE_ATTR_ _overload_to_extern_sum_16x64
+ (_CD val, _CD *rv, _CD *tvs, uint32_t *td, const _CD iv)
+ { __kmpc_xteamr_cd_16x64(val, rv, tvs, td,
+   __kmpc_rfun_sum_cd, __kmpc_rfun_sum_lds_cd, iv);}
+void _INLINE_ATTR_ _overload_to_extern_sum_16x64
+ (_CF val, _CF *rv, _CF *tvs, uint32_t *td, const _CF iv)
+ { __kmpc_xteamr_cf_16x64(val, rv, tvs, td,
+   __kmpc_rfun_sum_cf, __kmpc_rfun_sum_lds_cf, iv);}
+void _INLINE_ATTR_ _overload_to_extern_sum_16x64
+ (int val, int *rv, int *tvs, uint32_t *td, const int iv)
+ { __kmpc_xteamr_i_16x64(val, rv, tvs, td,
+   __kmpc_rfun_sum_i, __kmpc_rfun_sum_lds_i, iv);}
+void _INLINE_ATTR_ _overload_to_extern_sum_16x64
+ (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv)
+ { __kmpc_xteamr_ui_16x64(val, rv, tvs, td,
+   __kmpc_rfun_sum_ui, __kmpc_rfun_sum_lds_ui, iv);}
+void _INLINE_ATTR_ _overload_to_extern_sum_16x64
+ (long val, long *rv, long *tvs, uint32_t *td, const long iv)
+ { __kmpc_xteamr_l_16x64(val, rv, tvs, td,
+   __kmpc_rfun_sum_l, __kmpc_rfun_sum_lds_l, iv);}
+void _INLINE_ATTR_ _overload_to_extern_sum_16x64
+ (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv)
+ { __kmpc_xteamr_ul_16x64(val, rv, tvs, td,
+   __kmpc_rfun_sum_ul, __kmpc_rfun_sum_lds_ul, iv);}
+void _INLINE_ATTR_ _overload_to_extern_sum_32x32
+ (double val, double *rv, double *tvs, uint32_t *td, const double iv)
+ { __kmpc_xteamr_d_32x32(val, rv, tvs, td,
+   __kmpc_rfun_sum_d, __kmpc_rfun_sum_lds_d, iv);}
+void _INLINE_ATTR_ _overload_to_extern_sum_32x32
+ (float val, float *rv, float *tvs, uint32_t *td, const float iv)
+ { __kmpc_xteamr_f_32x32(val, rv, tvs, td,
+   __kmpc_rfun_sum_f, __kmpc_rfun_sum_lds_f, iv);}
+void _INLINE_ATTR_ _overload_to_extern_sum_32x32
+ (_CD val, _CD *rv, _CD *tvs, uint32_t *td, const _CD iv)
+ { __kmpc_xteamr_cd_32x32(val, rv, tvs, td,
+   __kmpc_rfun_sum_cd,
__kmpc_rfun_sum_lds_cd, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_32x32 + (_CF val, _CF *rv, _CF *tvs, uint32_t *td, const _CF iv) + { __kmpc_xteamr_cf_32x32(val, rv, tvs, td, + __kmpc_rfun_sum_cf, __kmpc_rfun_sum_lds_cf, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_32x32 + (int val, int *rv, int *tvs, uint32_t *td, const int iv) + { __kmpc_xteamr_i_32x32(val, rv, tvs, td, + __kmpc_rfun_sum_i, __kmpc_rfun_sum_lds_i, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_32x32 + (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv) + { __kmpc_xteamr_ui_32x32(val, rv, tvs, td, + __kmpc_rfun_sum_ui, __kmpc_rfun_sum_lds_ui, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_32x32 + (long val, long *rv, long *tvs, uint32_t *td, const long iv) + { __kmpc_xteamr_l_32x32(val, rv, tvs, td, + __kmpc_rfun_sum_l, __kmpc_rfun_sum_lds_l, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_32x32 + (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv) + { __kmpc_xteamr_ul_32x32(val, rv, tvs, td, + __kmpc_rfun_sum_ul, __kmpc_rfun_sum_lds_ul, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_8x64 + (double val, double *rv, double *tvs, uint32_t *td, const double iv) + { __kmpc_xteamr_d_8x64(val, rv, tvs, td, + __kmpc_rfun_sum_d, __kmpc_rfun_sum_lds_d, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_8x64 + (float val, float *rv, float *tvs, uint32_t *td, const float iv) + { __kmpc_xteamr_f_8x64(val, rv, tvs, td, + __kmpc_rfun_sum_f, __kmpc_rfun_sum_lds_f, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_8x64 + (_CD val, _CD *rv, _CD *tvs, uint32_t *td, const _CD iv) + { __kmpc_xteamr_cd_8x64(val, rv, tvs, td, + __kmpc_rfun_sum_cd, __kmpc_rfun_sum_lds_cd, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_8x64 + (_CF val, _CF *rv, _CF *tvs, uint32_t *td, const _CF iv) + { __kmpc_xteamr_cf_8x64(val, rv, tvs, td, + __kmpc_rfun_sum_cf, __kmpc_rfun_sum_lds_cf, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_8x64 + (int val, int *rv, int *tvs, uint32_t *td, const int iv) + { __kmpc_xteamr_i_8x64(val, rv, tvs, td, + __kmpc_rfun_sum_i, __kmpc_rfun_sum_lds_i, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_8x64 + (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv) + { __kmpc_xteamr_ui_8x64(val, rv, tvs, td, + __kmpc_rfun_sum_ui, __kmpc_rfun_sum_lds_ui, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_8x64 + (long val, long *rv, long *tvs, uint32_t *td, const long iv) + { __kmpc_xteamr_l_8x64(val, rv, tvs, td, + __kmpc_rfun_sum_l, __kmpc_rfun_sum_lds_l, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_8x64 + (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv) + { __kmpc_xteamr_ul_8x64(val, rv, tvs, td, + __kmpc_rfun_sum_ul, __kmpc_rfun_sum_lds_ul, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_16x32 + (double val, double *rv, double *tvs, uint32_t *td, const double iv) + { __kmpc_xteamr_d_16x32(val, rv, tvs, td, + __kmpc_rfun_sum_d, __kmpc_rfun_sum_lds_d, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_16x32 + (float val, float *rv, float *tvs, uint32_t *td, const float iv) + { __kmpc_xteamr_f_16x32(val, rv, tvs, td, + __kmpc_rfun_sum_f, __kmpc_rfun_sum_lds_f, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_16x32 + (_CD val, _CD *rv, _CD *tvs, uint32_t *td, const _CD iv) + { __kmpc_xteamr_cd_16x32(val, rv, tvs, td, + __kmpc_rfun_sum_cd, __kmpc_rfun_sum_lds_cd, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_16x32 + (_CF val, _CF *rv, _CF *tvs, uint32_t *td, const _CF iv) + { __kmpc_xteamr_cf_16x32(val, rv, tvs, td, + __kmpc_rfun_sum_cf, __kmpc_rfun_sum_lds_cf, iv);} +void 
_INLINE_ATTR_ _overload_to_extern_sum_16x32 + (int val, int *rv, int *tvs, uint32_t *td, const int iv) + { __kmpc_xteamr_i_16x32(val, rv, tvs, td, + __kmpc_rfun_sum_i, __kmpc_rfun_sum_lds_i, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_16x32 + (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv) + { __kmpc_xteamr_ui_16x32(val, rv, tvs, td, + __kmpc_rfun_sum_ui, __kmpc_rfun_sum_lds_ui, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_16x32 + (long val, long *rv, long *tvs, uint32_t *td, const long iv) + { __kmpc_xteamr_l_16x32(val, rv, tvs, td, + __kmpc_rfun_sum_l, __kmpc_rfun_sum_lds_l, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_16x32 + (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv) + { __kmpc_xteamr_ul_16x32(val, rv, tvs, td, + __kmpc_rfun_sum_ul, __kmpc_rfun_sum_lds_ul, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_4x64 + (double val, double *rv, double *tvs, uint32_t *td, const double iv) + { __kmpc_xteamr_d_4x64(val, rv, tvs, td, + __kmpc_rfun_sum_d, __kmpc_rfun_sum_lds_d, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_4x64 + (float val, float *rv, float *tvs, uint32_t *td, const float iv) + { __kmpc_xteamr_f_4x64(val, rv, tvs, td, + __kmpc_rfun_sum_f, __kmpc_rfun_sum_lds_f, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_4x64 + (_CD val, _CD *rv, _CD *tvs, uint32_t *td, const _CD iv) + { __kmpc_xteamr_cd_4x64(val, rv, tvs, td, + __kmpc_rfun_sum_cd, __kmpc_rfun_sum_lds_cd, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_4x64 + (_CF val, _CF *rv, _CF *tvs, uint32_t *td, const _CF iv) + { __kmpc_xteamr_cf_4x64(val, rv, tvs, td, + __kmpc_rfun_sum_cf, __kmpc_rfun_sum_lds_cf, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_4x64 + (int val, int *rv, int *tvs, uint32_t *td, const int iv) + { __kmpc_xteamr_i_4x64(val, rv, tvs, td, + __kmpc_rfun_sum_i, __kmpc_rfun_sum_lds_i, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_4x64 + (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv) + { __kmpc_xteamr_ui_4x64(val, rv, tvs, td, + __kmpc_rfun_sum_ui, __kmpc_rfun_sum_lds_ui, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_4x64 + (long val, long *rv, long *tvs, uint32_t *td, const long iv) + { __kmpc_xteamr_l_4x64(val, rv, tvs, td, + __kmpc_rfun_sum_l, __kmpc_rfun_sum_lds_l, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_4x64 + (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv) + { __kmpc_xteamr_ul_4x64(val, rv, tvs, td, + __kmpc_rfun_sum_ul, __kmpc_rfun_sum_lds_ul, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_8x32 + (double val, double *rv, double *tvs, uint32_t *td, const double iv) + { __kmpc_xteamr_d_8x32(val, rv, tvs, td, + __kmpc_rfun_sum_d, __kmpc_rfun_sum_lds_d, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_8x32 + (float val, float *rv, float *tvs, uint32_t *td, const float iv) + { __kmpc_xteamr_f_8x32(val, rv, tvs, td, + __kmpc_rfun_sum_f, __kmpc_rfun_sum_lds_f, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_8x32 + (_CD val, _CD *rv, _CD *tvs, uint32_t *td, const _CD iv) + { __kmpc_xteamr_cd_8x32(val, rv, tvs, td, + __kmpc_rfun_sum_cd, __kmpc_rfun_sum_lds_cd, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_8x32 + (_CF val, _CF *rv, _CF *tvs, uint32_t *td, const _CF iv) + { __kmpc_xteamr_cf_8x32(val, rv, tvs, td, + __kmpc_rfun_sum_cf, __kmpc_rfun_sum_lds_cf, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_8x32 + (int val, int *rv, int *tvs, uint32_t *td, const int iv) + { __kmpc_xteamr_i_8x32(val, rv, tvs, td, + __kmpc_rfun_sum_i, __kmpc_rfun_sum_lds_i, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_8x32 + (_UI val, 
_UI *rv, _UI *tvs, uint32_t *td, const _UI iv) + { __kmpc_xteamr_ui_8x32(val, rv, tvs, td, + __kmpc_rfun_sum_ui, __kmpc_rfun_sum_lds_ui, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_8x32 + (long val, long *rv, long *tvs, uint32_t *td, const long iv) + { __kmpc_xteamr_l_8x32(val, rv, tvs, td, + __kmpc_rfun_sum_l, __kmpc_rfun_sum_lds_l, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_8x32 + (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv) + { __kmpc_xteamr_ul_8x32(val, rv, tvs, td, + __kmpc_rfun_sum_ul, __kmpc_rfun_sum_lds_ul, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_16x64 + (double val, double *rv, double *tvs, uint32_t *td, const double iv) + { __kmpc_xteamr_d_16x64(val, rv, tvs, td, + __kmpc_rfun_max_d, __kmpc_rfun_max_lds_d, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_16x64 + (float val, float *rv, float *tvs, uint32_t *td, const float iv) + { __kmpc_xteamr_f_16x64(val, rv, tvs, td, + __kmpc_rfun_max_f, __kmpc_rfun_max_lds_f, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_16x64 + (int val, int *rv, int *tvs, uint32_t *td, const int iv) + { __kmpc_xteamr_i_16x64(val, rv, tvs, td, + __kmpc_rfun_max_i, __kmpc_rfun_max_lds_i, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_16x64 + (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv) + { __kmpc_xteamr_ui_16x64(val, rv, tvs, td, + __kmpc_rfun_max_ui, __kmpc_rfun_max_lds_ui, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_16x64 + (long val, long *rv, long *tvs, uint32_t *td, const long iv) + { __kmpc_xteamr_l_16x64(val, rv, tvs, td, + __kmpc_rfun_max_l, __kmpc_rfun_max_lds_l, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_16x64 + (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv) + { __kmpc_xteamr_ul_16x64(val, rv, tvs, td, + __kmpc_rfun_max_ul, __kmpc_rfun_max_lds_ul, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_32x32 + (double val, double *rv, double *tvs, uint32_t *td, const double iv) + { __kmpc_xteamr_d_32x32(val, rv, tvs, td, + __kmpc_rfun_max_d, __kmpc_rfun_max_lds_d, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_32x32 + (float val, float *rv, float *tvs, uint32_t *td, const float iv) + { __kmpc_xteamr_f_32x32(val, rv, tvs, td, + __kmpc_rfun_max_f, __kmpc_rfun_max_lds_f, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_32x32 + (int val, int *rv, int *tvs, uint32_t *td, const int iv) + { __kmpc_xteamr_i_32x32(val, rv, tvs, td, + __kmpc_rfun_max_i, __kmpc_rfun_max_lds_i, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_32x32 + (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv) + { __kmpc_xteamr_ui_32x32(val, rv, tvs, td, + __kmpc_rfun_max_ui, __kmpc_rfun_max_lds_ui, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_32x32 + (long val, long *rv, long *tvs, uint32_t *td, const long iv) + { __kmpc_xteamr_l_32x32(val, rv, tvs, td, + __kmpc_rfun_max_l, __kmpc_rfun_max_lds_l, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_32x32 + (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv) + { __kmpc_xteamr_ul_32x32(val, rv, tvs, td, + __kmpc_rfun_max_ul, __kmpc_rfun_max_lds_ul, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_8x64 + (double val, double *rv, double *tvs, uint32_t *td, const double iv) + { __kmpc_xteamr_d_8x64(val, rv, tvs, td, + __kmpc_rfun_max_d, __kmpc_rfun_max_lds_d, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_8x64 + (float val, float *rv, float *tvs, uint32_t *td, const float iv) + { __kmpc_xteamr_f_8x64(val, rv, tvs, td, + __kmpc_rfun_max_f, __kmpc_rfun_max_lds_f, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_8x64 + (int val, int *rv, int *tvs, 
uint32_t *td, const int iv) + { __kmpc_xteamr_i_8x64(val, rv, tvs, td, + __kmpc_rfun_max_i, __kmpc_rfun_max_lds_i, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_8x64 + (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv) + { __kmpc_xteamr_ui_8x64(val, rv, tvs, td, + __kmpc_rfun_max_ui, __kmpc_rfun_max_lds_ui, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_8x64 + (long val, long *rv, long *tvs, uint32_t *td, const long iv) + { __kmpc_xteamr_l_8x64(val, rv, tvs, td, + __kmpc_rfun_max_l, __kmpc_rfun_max_lds_l, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_8x64 + (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv) + { __kmpc_xteamr_ul_8x64(val, rv, tvs, td, + __kmpc_rfun_max_ul, __kmpc_rfun_max_lds_ul, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_16x32 + (double val, double *rv, double *tvs, uint32_t *td, const double iv) + { __kmpc_xteamr_d_16x32(val, rv, tvs, td, + __kmpc_rfun_max_d, __kmpc_rfun_max_lds_d, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_16x32 + (float val, float *rv, float *tvs, uint32_t *td, const float iv) + { __kmpc_xteamr_f_16x32(val, rv, tvs, td, + __kmpc_rfun_max_f, __kmpc_rfun_max_lds_f, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_16x32 + (int val, int *rv, int *tvs, uint32_t *td, const int iv) + { __kmpc_xteamr_i_16x32(val, rv, tvs, td, + __kmpc_rfun_max_i, __kmpc_rfun_max_lds_i, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_16x32 + (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv) + { __kmpc_xteamr_ui_16x32(val, rv, tvs, td, + __kmpc_rfun_max_ui, __kmpc_rfun_max_lds_ui, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_16x32 + (long val, long *rv, long *tvs, uint32_t *td, const long iv) + { __kmpc_xteamr_l_16x32(val, rv, tvs, td, + __kmpc_rfun_max_l, __kmpc_rfun_max_lds_l, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_16x32 + (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv) + { __kmpc_xteamr_ul_16x32(val, rv, tvs, td, + __kmpc_rfun_max_ul, __kmpc_rfun_max_lds_ul, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_4x64 + (double val, double *rv, double *tvs, uint32_t *td, const double iv) + { __kmpc_xteamr_d_4x64(val, rv, tvs, td, + __kmpc_rfun_max_d, __kmpc_rfun_max_lds_d, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_4x64 + (float val, float *rv, float *tvs, uint32_t *td, const float iv) + { __kmpc_xteamr_f_4x64(val, rv, tvs, td, + __kmpc_rfun_max_f, __kmpc_rfun_max_lds_f, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_4x64 + (int val, int *rv, int *tvs, uint32_t *td, const int iv) + { __kmpc_xteamr_i_4x64(val, rv, tvs, td, + __kmpc_rfun_max_i, __kmpc_rfun_max_lds_i, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_4x64 + (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv) + { __kmpc_xteamr_ui_4x64(val, rv, tvs, td, + __kmpc_rfun_max_ui, __kmpc_rfun_max_lds_ui, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_4x64 + (long val, long *rv, long *tvs, uint32_t *td, const long iv) + { __kmpc_xteamr_l_4x64(val, rv, tvs, td, + __kmpc_rfun_max_l, __kmpc_rfun_max_lds_l, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_4x64 + (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv) + { __kmpc_xteamr_ul_4x64(val, rv, tvs, td, + __kmpc_rfun_max_ul, __kmpc_rfun_max_lds_ul, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_8x32 + (double val, double *rv, double *tvs, uint32_t *td, const double iv) + { __kmpc_xteamr_d_8x32(val, rv, tvs, td, + __kmpc_rfun_max_d, __kmpc_rfun_max_lds_d, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_8x32 + (float val, float *rv, float *tvs, uint32_t *td, const float iv) + { 
__kmpc_xteamr_f_8x32(val, rv, tvs, td, + __kmpc_rfun_max_f, __kmpc_rfun_max_lds_f, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_8x32 + (int val, int *rv, int *tvs, uint32_t *td, const int iv) + { __kmpc_xteamr_i_8x32(val, rv, tvs, td, + __kmpc_rfun_max_i, __kmpc_rfun_max_lds_i, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_8x32 + (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv) + { __kmpc_xteamr_ui_8x32(val, rv, tvs, td, + __kmpc_rfun_max_ui, __kmpc_rfun_max_lds_ui, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_8x32 + (long val, long *rv, long *tvs, uint32_t *td, const long iv) + { __kmpc_xteamr_l_8x32(val, rv, tvs, td, + __kmpc_rfun_max_l, __kmpc_rfun_max_lds_l, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_8x32 + (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv) + { __kmpc_xteamr_ul_8x32(val, rv, tvs, td, + __kmpc_rfun_max_ul, __kmpc_rfun_max_lds_ul, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_16x64 + (double val, double *rv, double *tvs, uint32_t *td, const double iv) + { __kmpc_xteamr_d_16x64(val, rv, tvs, td, + __kmpc_rfun_min_d, __kmpc_rfun_min_lds_d, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_16x64 + (float val, float *rv, float *tvs, uint32_t *td, const float iv) + { __kmpc_xteamr_f_16x64(val, rv, tvs, td, + __kmpc_rfun_min_f, __kmpc_rfun_min_lds_f, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_16x64 + (int val, int *rv, int *tvs, uint32_t *td, const int iv) + { __kmpc_xteamr_i_16x64(val, rv, tvs, td, + __kmpc_rfun_min_i, __kmpc_rfun_min_lds_i, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_16x64 + (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv) + { __kmpc_xteamr_ui_16x64(val, rv, tvs, td, + __kmpc_rfun_min_ui, __kmpc_rfun_min_lds_ui, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_16x64 + (long val, long *rv, long *tvs, uint32_t *td, const long iv) + { __kmpc_xteamr_l_16x64(val, rv, tvs, td, + __kmpc_rfun_min_l, __kmpc_rfun_min_lds_l, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_16x64 + (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv) + { __kmpc_xteamr_ul_16x64(val, rv, tvs, td, + __kmpc_rfun_min_ul, __kmpc_rfun_min_lds_ul, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_32x32 + (double val, double *rv, double *tvs, uint32_t *td, const double iv) + { __kmpc_xteamr_d_32x32(val, rv, tvs, td, + __kmpc_rfun_min_d, __kmpc_rfun_min_lds_d, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_32x32 + (float val, float *rv, float *tvs, uint32_t *td, const float iv) + { __kmpc_xteamr_f_32x32(val, rv, tvs, td, + __kmpc_rfun_min_f, __kmpc_rfun_min_lds_f, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_32x32 + (int val, int *rv, int *tvs, uint32_t *td, const int iv) + { __kmpc_xteamr_i_32x32(val, rv, tvs, td, + __kmpc_rfun_min_i, __kmpc_rfun_min_lds_i, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_32x32 + (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv) + { __kmpc_xteamr_ui_32x32(val, rv, tvs, td, + __kmpc_rfun_min_ui, __kmpc_rfun_min_lds_ui, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_32x32 + (long val, long *rv, long *tvs, uint32_t *td, const long iv) + { __kmpc_xteamr_l_32x32(val, rv, tvs, td, + __kmpc_rfun_min_l, __kmpc_rfun_min_lds_l, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_32x32 + (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv) + { __kmpc_xteamr_ul_32x32(val, rv, tvs, td, + __kmpc_rfun_min_ul, __kmpc_rfun_min_lds_ul, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_8x64 + (double val, double *rv, double *tvs, uint32_t *td, const double iv) + { __kmpc_xteamr_d_8x64(val, rv, 
tvs, td, + __kmpc_rfun_min_d, __kmpc_rfun_min_lds_d, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_8x64 + (float val, float *rv, float *tvs, uint32_t *td, const float iv) + { __kmpc_xteamr_f_8x64(val, rv, tvs, td, + __kmpc_rfun_min_f, __kmpc_rfun_min_lds_f, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_8x64 + (int val, int *rv, int *tvs, uint32_t *td, const int iv) + { __kmpc_xteamr_i_8x64(val, rv, tvs, td, + __kmpc_rfun_min_i, __kmpc_rfun_min_lds_i, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_8x64 + (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv) + { __kmpc_xteamr_ui_8x64(val, rv, tvs, td, + __kmpc_rfun_min_ui, __kmpc_rfun_min_lds_ui, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_8x64 + (long val, long *rv, long *tvs, uint32_t *td, const long iv) + { __kmpc_xteamr_l_8x64(val, rv, tvs, td, + __kmpc_rfun_min_l, __kmpc_rfun_min_lds_l, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_8x64 + (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv) + { __kmpc_xteamr_ul_8x64(val, rv, tvs, td, + __kmpc_rfun_min_ul, __kmpc_rfun_min_lds_ul, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_16x32 + (double val, double *rv, double *tvs, uint32_t *td, const double iv) + { __kmpc_xteamr_d_16x32(val, rv, tvs, td, + __kmpc_rfun_min_d, __kmpc_rfun_min_lds_d, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_16x32 + (float val, float *rv, float *tvs, uint32_t *td, const float iv) + { __kmpc_xteamr_f_16x32(val, rv, tvs, td, + __kmpc_rfun_min_f, __kmpc_rfun_min_lds_f, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_16x32 + (int val, int *rv, int *tvs, uint32_t *td, const int iv) + { __kmpc_xteamr_i_16x32(val, rv, tvs, td, + __kmpc_rfun_min_i, __kmpc_rfun_min_lds_i, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_16x32 + (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv) + { __kmpc_xteamr_ui_16x32(val, rv, tvs, td, + __kmpc_rfun_min_ui, __kmpc_rfun_min_lds_ui, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_16x32 + (long val, long *rv, long *tvs, uint32_t *td, const long iv) + { __kmpc_xteamr_l_16x32(val, rv, tvs, td, + __kmpc_rfun_min_l, __kmpc_rfun_min_lds_l, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_16x32 + (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv) + { __kmpc_xteamr_ul_16x32(val, rv, tvs, td, + __kmpc_rfun_min_ul, __kmpc_rfun_min_lds_ul, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_4x64 + (double val, double *rv, double *tvs, uint32_t *td, const double iv) + { __kmpc_xteamr_d_4x64(val, rv, tvs, td, + __kmpc_rfun_min_d, __kmpc_rfun_min_lds_d, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_4x64 + (float val, float *rv, float *tvs, uint32_t *td, const float iv) + { __kmpc_xteamr_f_4x64(val, rv, tvs, td, + __kmpc_rfun_min_f, __kmpc_rfun_min_lds_f, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_4x64 + (int val, int *rv, int *tvs, uint32_t *td, const int iv) + { __kmpc_xteamr_i_4x64(val, rv, tvs, td, + __kmpc_rfun_min_i, __kmpc_rfun_min_lds_i, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_4x64 + (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv) + { __kmpc_xteamr_ui_4x64(val, rv, tvs, td, + __kmpc_rfun_min_ui, __kmpc_rfun_min_lds_ui, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_4x64 + (long val, long *rv, long *tvs, uint32_t *td, const long iv) + { __kmpc_xteamr_l_4x64(val, rv, tvs, td, + __kmpc_rfun_min_l, __kmpc_rfun_min_lds_l, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_4x64 + (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv) + { __kmpc_xteamr_ul_4x64(val, rv, tvs, td, + __kmpc_rfun_min_ul, 
__kmpc_rfun_min_lds_ul, iv);}
+void _INLINE_ATTR_ _overload_to_extern_min_8x32
+ (double val, double *rv, double *tvs, uint32_t *td, const double iv)
+ { __kmpc_xteamr_d_8x32(val, rv, tvs, td,
+   __kmpc_rfun_min_d, __kmpc_rfun_min_lds_d, iv);}
+void _INLINE_ATTR_ _overload_to_extern_min_8x32
+ (float val, float *rv, float *tvs, uint32_t *td, const float iv)
+ { __kmpc_xteamr_f_8x32(val, rv, tvs, td,
+   __kmpc_rfun_min_f, __kmpc_rfun_min_lds_f, iv);}
+void _INLINE_ATTR_ _overload_to_extern_min_8x32
+ (int val, int *rv, int *tvs, uint32_t *td, const int iv)
+ { __kmpc_xteamr_i_8x32(val, rv, tvs, td,
+   __kmpc_rfun_min_i, __kmpc_rfun_min_lds_i, iv);}
+void _INLINE_ATTR_ _overload_to_extern_min_8x32
+ (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv)
+ { __kmpc_xteamr_ui_8x32(val, rv, tvs, td,
+   __kmpc_rfun_min_ui, __kmpc_rfun_min_lds_ui, iv);}
+void _INLINE_ATTR_ _overload_to_extern_min_8x32
+ (long val, long *rv, long *tvs, uint32_t *td, const long iv)
+ { __kmpc_xteamr_l_8x32(val, rv, tvs, td,
+   __kmpc_rfun_min_l, __kmpc_rfun_min_lds_l, iv);}
+void _INLINE_ATTR_ _overload_to_extern_min_8x32
+ (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv)
+ { __kmpc_xteamr_ul_8x32(val, rv, tvs, td,
+   __kmpc_rfun_min_ul, __kmpc_rfun_min_lds_ul, iv);}
+#undef _CD
+#undef _CF
+#undef _UI
+#undef _UL
+#undef _INLINE_ATTR_
diff --git a/openmp/libomptarget/test/xteamr/test_xteamr.cpp b/openmp/libomptarget/test/xteamr/test_xteamr.cpp
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/test/xteamr/test_xteamr.cpp
@@ -0,0 +1,712 @@
+//===----- test_xteamr.cpp - Test for Xteamr DeviceRTL functions ---C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// performance and functional tests for Xteamr reduction helper functions in
+// libomptarget/DeviceRTL/src/Xteamr.cpp
+//
+// RUN: %libomptarget-compileoptxx-run-and-check-nvptx64-nvidia-cuda
+// REQUIRES: nvptx64-nvidia-cuda
+// CHECK: ALL TESTS PASSED
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <iomanip>
+#include <iostream>
+#include <limits>
+#include <numeric>
+#include <omp.h>
+#include <string>
+#include <vector>
+
+#include "test_xteamr.h"
+
+#ifndef _ARRAY_SIZE
+#define _ARRAY_SIZE 33554432
+#endif
+const uint64_t ARRAY_SIZE = _ARRAY_SIZE;
+unsigned int repeat_num_times = 12;
+unsigned int ignore_times = 2; // ignore this many timings first
+
+// If we know at compile time that we have 0 index with 1 stride,
+// then generate an optimized _BIG_JUMP_LOOP.
+// This test case has index 0 and stride 1, so we set this here.
+#define __OPTIMIZE_INDEX0_STRIDE1
+
+// Extern Xteamr functions are designed for 1024, 512, and 256 thread blocks.
+// The default here is 512.
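+// As an illustration (a sketch of the pattern used by the sim_* functions
+// below, with hypothetical names), each device thread accumulates a private
+// partial value with a big-jump loop and then calls the matching helper,
+// e.g. for a double sum with 512 threads on a warp-size-64 GPU:
+//   double val = rnv;                 // start from the reduction null value
+//   for (int64_t i = k; i < size; i += nteams * 512)
+//     val += a[i] * b[i];             // thread-local partial result
+//   __kmpc_xteamr_d_8x64(val, &result, team_vals, teams_done,
+//                        __kmpc_rfun_sum_d, __kmpc_rfun_sum_lds_d, rnv);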
+
+#ifndef _XTEAM_NUM_THREADS
+#define _XTEAM_NUM_THREADS 512
+#endif
+#ifndef _XTEAM_NUM_TEAMS
+#define _XTEAM_NUM_TEAMS 80
+#endif
+
+#if _XTEAM_NUM_THREADS == 1024
+#define _SUM_OVERLOAD_64_FCT _overload_to_extern_sum_16x64
+#define _SUM_OVERLOAD_32_FCT _overload_to_extern_sum_32x32
+#define _MAX_OVERLOAD_64_FCT _overload_to_extern_max_16x64
+#define _MAX_OVERLOAD_32_FCT _overload_to_extern_max_32x32
+#define _MIN_OVERLOAD_64_FCT _overload_to_extern_min_16x64
+#define _MIN_OVERLOAD_32_FCT _overload_to_extern_min_32x32
+#elif _XTEAM_NUM_THREADS == 512
+#define _SUM_OVERLOAD_64_FCT _overload_to_extern_sum_8x64
+#define _SUM_OVERLOAD_32_FCT _overload_to_extern_sum_16x32
+#define _MAX_OVERLOAD_64_FCT _overload_to_extern_max_8x64
+#define _MAX_OVERLOAD_32_FCT _overload_to_extern_max_16x32
+#define _MIN_OVERLOAD_64_FCT _overload_to_extern_min_8x64
+#define _MIN_OVERLOAD_32_FCT _overload_to_extern_min_16x32
+#elif _XTEAM_NUM_THREADS == 256
+#define _SUM_OVERLOAD_64_FCT _overload_to_extern_sum_4x64
+#define _SUM_OVERLOAD_32_FCT _overload_to_extern_sum_8x32
+#define _MAX_OVERLOAD_64_FCT _overload_to_extern_max_4x64
+#define _MAX_OVERLOAD_32_FCT _overload_to_extern_max_8x32
+#define _MIN_OVERLOAD_64_FCT _overload_to_extern_min_4x64
+#define _MIN_OVERLOAD_32_FCT _overload_to_extern_min_8x32
+#else
+#error Invalid value for _XTEAM_NUM_THREADS. Must be 1024, 512, or 256
+#endif
+
+// Question to Dhruva: should the limiter include the stride?
+#if defined(__NVPTX__) && _XTEAM_NUM_THREADS == 1024
+// CUDA may restrict the maximum number of threads when 1024 are requested,
+// so the big jump on the inner loop depends on the actual (limited) number
+// of threads returned by omp_get_num_threads(). It also requires that we
+// call the helper reducer function only when k is in this range. Lastly,
+// the helper function must clear (set to rnv) unused xwave values before
+// the optimized (unrolled) xwave reduction loop. See Xteamr.cpp.
+// These three things kill performance on NVIDIA GPUs when 1024 threads are
+// requested, so codegen requests at most 512 threads (16x32) for NVIDIA
+// GPUs.
+#define _LIMIT_JUMP_TO_CUDA_REDUCED_THREADS(nteams) \
+  if (k < (nteams * omp_get_num_threads()))
+#ifdef __OPTIMIZE_INDEX0_STRIDE1
+#define _BIG_JUMP_LOOP(nteams, size, stride, offset) \
+  for (int64_t i = k; i < size; i += nteams * omp_get_num_threads())
+#else
+#define _BIG_JUMP_LOOP(nteams, size, stride, offset) \
+  for (int64_t i = ((k * stride) + offset); i < size; \
+       i += (nteams * omp_get_num_threads() * stride))
+#endif
+#else
+// Assume AMDGPU, or NVIDIA with 512 or 256 threads, always gets the
+// requested number of threads, so no conditional is needed to limit
+// reductions.
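+// (On this path the limiter macro below expands to nothing, while on the
+// CUDA 1024-thread path above it expands to the guard
+// `if (k < (nteams * omp_get_num_threads()))` around the helper call.)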
+#define _LIMIT_JUMP_TO_CUDA_REDUCED_THREADS(nteams)
+
+// Format of _BIG_JUMP_LOOP depends on whether we optimize for 0 index with
+// 1 stride.
+#if _XTEAM_NUM_THREADS == 1024
+#ifdef __OPTIMIZE_INDEX0_STRIDE1
+#define _BIG_JUMP_LOOP(nteams, size, stride, offset) \
+  for (int64_t i = k; i < size; i += nteams * 1024)
+#else
+#define _BIG_JUMP_LOOP(nteams, size, stride, offset) \
+  for (int64_t i = ((k * stride) + offset); i < size; \
+       i += (nteams * 1024 * stride))
+#endif
+
+#elif _XTEAM_NUM_THREADS == 512
+#ifdef __OPTIMIZE_INDEX0_STRIDE1
+#define _BIG_JUMP_LOOP(nteams, size, stride, offset) \
+  for (int64_t i = k; i < size; i += nteams * 512)
+#else
+#define _BIG_JUMP_LOOP(nteams, size, stride, offset) \
+  for (int64_t i = ((k * stride) + offset); i < size; \
+       i += (nteams * 512 * stride))
+#endif
+#else
+#ifdef __OPTIMIZE_INDEX0_STRIDE1
+#define _BIG_JUMP_LOOP(nteams, size, stride, offset) \
+  for (int64_t i = k; i < size; i += nteams * 256)
+#else
+#define _BIG_JUMP_LOOP(nteams, size, stride, offset) \
+  for (int64_t i = ((k * stride) + offset); i < size; \
+       i += (nteams * 256 * stride))
+#endif
+#endif // end if _XTEAM_NUM_THREADS == 1024, elif, else
+#endif // if defined(__NVPTX__) && _XTEAM_NUM_THREADS == 1024 else
+
+unsigned int test_run_rc = 0;
+
+template <typename T, bool DATA_TYPE_IS_INT> void run_tests(const uint64_t);
+template <typename TC, typename T> void run_tests_complex(const uint64_t);
+
+int main(int argc, char *argv[]) {
+  std::cout << std::endl
+            << "TEST DOUBLE " << _XTEAM_NUM_THREADS << " THREADS" << std::endl;
+  run_tests<double, false>(ARRAY_SIZE);
+  std::cout << std::endl
+            << "TEST FLOAT " << _XTEAM_NUM_THREADS << " THREADS" << std::endl;
+  run_tests<float, false>(ARRAY_SIZE);
+  std::cout << std::endl
+            << "TEST INT " << _XTEAM_NUM_THREADS << " THREADS" << std::endl;
+  run_tests<int, true>(ARRAY_SIZE);
+  std::cout << std::endl
+            << "TEST UNSIGNED INT " << _XTEAM_NUM_THREADS << " THREADS"
+            << std::endl;
+  run_tests<unsigned int, true>(ARRAY_SIZE);
+  std::cout << std::endl
+            << "TEST LONG " << _XTEAM_NUM_THREADS << " THREADS " << std::endl;
+  run_tests<long, true>(ARRAY_SIZE);
+  std::cout << std::endl
+            << "TEST UNSIGNED LONG " << _XTEAM_NUM_THREADS << " THREADS"
+            << std::endl;
+  run_tests<unsigned long, true>(ARRAY_SIZE);
+  std::cout << std::endl
+            << "TEST DOUBLE COMPLEX " << _XTEAM_NUM_THREADS << " THREADS"
+            << std::endl;
+  run_tests_complex<double _Complex, double>(ARRAY_SIZE);
+  std::cout << std::endl
+            << "TEST FLOAT COMPLEX " << _XTEAM_NUM_THREADS << " THREADS"
+            << std::endl;
+  run_tests_complex<float _Complex, float>(ARRAY_SIZE);
+  if (test_run_rc == 0)
+    printf("ALL TESTS PASSED\n");
+  return test_run_rc;
+}
+
+template <typename T> T omp_dot(T *a, T *b, uint64_t array_size) {
+  T sum = 0.0;
+#pragma omp target teams distribute parallel for map(tofrom: sum) reduction(+:sum)
+  for (int64_t i = 0; i < array_size; i++)
+    sum += a[i] * b[i];
+  return sum;
+}
+
+template <typename T> T omp_max(T *c, uint64_t array_size) {
+  T maxval = std::numeric_limits<T>::lowest();
+#pragma omp target teams distribute parallel for map(tofrom \
+                                                     : maxval) \
+    reduction(max \
+              : maxval)
+  for (int64_t i = 0; i < array_size; i++)
+    maxval = (c[i] > maxval) ? c[i] : maxval;
+  return maxval;
+}
+
+template <typename T> T omp_min(T *c, uint64_t array_size) {
+  T minval = std::numeric_limits<T>::max();
+#pragma omp target teams distribute parallel for map(tofrom \
+                                                     : minval) \
+    reduction(min \
+              : minval)
+  for (int64_t i = 0; i < array_size; i++) {
+    minval = (c[i] < minval) ? c[i] : minval;
+  }
+  return minval;
+}
+
+template <typename T> T sim_dot(T *a, T *b, int warp_size) {
+  T sum = T(0);
+  int devid = 0;
+  struct loop_ctl_t {
+    uint32_t *td_ptr;                 // Atomic counter accessed on device
+    uint32_t reserved;                // reserved
+    const int64_t stride = 1;         // stride to process input vectors
+    const int64_t offset = 0;         // Offset to initial index of input vectors
+    const int64_t size = _ARRAY_SIZE; // Size of input vectors
+    const T rnv = T(0);               // reduction null value
+    T *team_vals;                     // array of global team values
+  };
+  static uint32_t zero = 0;
+  static loop_ctl_t lc0;
+  static int64_t num_teams0 = 0;
+  if (!num_teams0) {
+    // num_teams0 = ompx_get_device_num_units(devid);
+    num_teams0 = _XTEAM_NUM_TEAMS;
+    lc0.td_ptr = (uint32_t *)omp_target_alloc(sizeof(uint32_t), devid);
+    lc0.team_vals = (T *)omp_target_alloc(sizeof(T) * num_teams0, devid);
+    omp_target_memcpy(lc0.td_ptr, &zero, sizeof(uint32_t), 0, 0, devid,
+                      omp_get_initial_device());
+  }
+
+  if (warp_size == 64) {
+#pragma omp target teams distribute parallel for num_teams(_XTEAM_NUM_TEAMS) \
+    num_threads(_XTEAM_NUM_THREADS) map(tofrom:sum) map(to:lc0)
+    for (uint64_t k = 0; k < (_XTEAM_NUM_TEAMS * _XTEAM_NUM_THREADS); k++) {
+      T val0 = lc0.rnv;
+      _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc0.size, lc0.stride, lc0.offset)
+      val0 += a[i] * b[i];
+      _SUM_OVERLOAD_64_FCT(val0, &sum, lc0.team_vals, lc0.td_ptr, lc0.rnv);
+    }
+  } else {
+#pragma omp target teams distribute parallel for num_teams(_XTEAM_NUM_TEAMS) \
+    num_threads(_XTEAM_NUM_THREADS) map(tofrom:sum) map(to:lc0)
+    for (uint64_t k = 0; k < (_XTEAM_NUM_TEAMS * _XTEAM_NUM_THREADS); k++) {
+      T val0 = lc0.rnv;
+      _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc0.size, lc0.stride, lc0.offset)
+      val0 += a[i] * b[i];
+      _LIMIT_JUMP_TO_CUDA_REDUCED_THREADS(_XTEAM_NUM_TEAMS)
+      _SUM_OVERLOAD_32_FCT(val0, &sum, lc0.team_vals, lc0.td_ptr, lc0.rnv);
+    }
+  }
+  return sum;
+}
+
+template <typename T> T sim_max(T *c, int warp_size) {
+  T retval = std::numeric_limits<T>::lowest();
+  int devid = 0;
+  struct loop_ctl_t {
+    uint32_t *td_ptr;                 // Atomic counter accessed on device
+    uint32_t reserved;                // reserved
+    const int64_t stride = 1;         // stride to process input vectors
+    const int64_t offset = 0;         // Offset to index of input vectors
+    const int64_t size = _ARRAY_SIZE; // Size of input vectors
+    T rnv;                            // reduction null value
+    T *team_vals;                     // array of global team values
+  };
+  static uint32_t zero = 0;
+  static loop_ctl_t lc1;
+  static int64_t num_teams1 = 0;
+  if (!num_teams1) {
+    // num_teams1 = ompx_get_device_num_units(devid);
+    num_teams1 = _XTEAM_NUM_TEAMS;
+    lc1.td_ptr = (uint32_t *)omp_target_alloc(sizeof(uint32_t), devid);
+    lc1.rnv = std::numeric_limits<T>::lowest();
+    lc1.team_vals = (T *)omp_target_alloc(sizeof(T) * num_teams1, devid);
+    omp_target_memcpy(lc1.td_ptr, &zero, sizeof(uint32_t), 0, 0, devid,
+                      omp_get_initial_device());
+  }
+  if (warp_size == 64) {
+#pragma omp target teams distribute parallel for num_teams(_XTEAM_NUM_TEAMS) \
+    num_threads(_XTEAM_NUM_THREADS) map(tofrom:retval) map(to:lc1)
+    for (uint64_t k = 0; k < (_XTEAM_NUM_TEAMS * _XTEAM_NUM_THREADS); k++) {
+      T val1 = lc1.rnv;
+      _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc1.size, lc1.stride, lc1.offset)
+      val1 = (c[i] > val1) ? c[i] : val1;
+      _MAX_OVERLOAD_64_FCT(val1, &retval, lc1.team_vals, lc1.td_ptr, lc1.rnv);
+    }
+  } else {
+#pragma omp target teams distribute parallel for num_teams(_XTEAM_NUM_TEAMS) \
+    num_threads(_XTEAM_NUM_THREADS) map(tofrom:retval) map(to:lc1)
+    for (uint64_t k = 0; k < (_XTEAM_NUM_TEAMS * _XTEAM_NUM_THREADS); k++) {
+      T val1 = lc1.rnv;
+      _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc1.size, lc1.stride, lc1.offset)
+      val1 = (c[i] > val1) ? c[i] : val1;
+      _LIMIT_JUMP_TO_CUDA_REDUCED_THREADS(_XTEAM_NUM_TEAMS)
+      _MAX_OVERLOAD_32_FCT(val1, &retval, lc1.team_vals, lc1.td_ptr, lc1.rnv);
+    }
+  }
+  return retval;
+}
+
+template <typename T> T sim_min(T *c, int warp_size) {
+  T retval = std::numeric_limits<T>::max();
+  int devid = 0;
+  struct loop_ctl_t {
+    uint32_t *td_ptr;                 // Atomic counter accessed on device
+    uint32_t reserved;                // reserved
+    const int64_t stride = 1;         // stride to process input vectors
+    const int64_t offset = 0;         // Offset to initial index of input vectors
+    const int64_t size = _ARRAY_SIZE; // Size of input vectors
+    T rnv;                            // reduction null value
+    T *team_vals;                     // array of global team values
+  };
+  static uint32_t zero = 0;
+  static loop_ctl_t lc2;
+  static int64_t num_teams2 = 0;
+  if (!num_teams2) {
+    // num_teams2 = ompx_get_device_num_units(devid);
+    num_teams2 = _XTEAM_NUM_TEAMS;
+    lc2.td_ptr = (uint32_t *)omp_target_alloc(sizeof(uint32_t), devid);
+    lc2.rnv = std::numeric_limits<T>::max();
+    lc2.team_vals = (T *)omp_target_alloc(sizeof(T) * num_teams2, devid);
+    omp_target_memcpy(lc2.td_ptr, &zero, sizeof(uint32_t), 0, 0, devid,
+                      omp_get_initial_device());
+  }
+  if (warp_size == 64) {
+#pragma omp target teams distribute parallel for num_teams(_XTEAM_NUM_TEAMS) \
+    num_threads(_XTEAM_NUM_THREADS) map(tofrom:retval) map(to:lc2)
+    for (uint64_t k = 0; k < (_XTEAM_NUM_TEAMS * _XTEAM_NUM_THREADS); k++) {
+      T val2 = lc2.rnv;
+      _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc2.size, lc2.stride, lc2.offset)
+      val2 = (c[i] < val2) ? c[i] : val2;
+      _MIN_OVERLOAD_64_FCT(val2, &retval, lc2.team_vals, lc2.td_ptr, lc2.rnv);
+    }
+  } else {
+#pragma omp target teams distribute parallel for num_teams(_XTEAM_NUM_TEAMS) \
+    num_threads(_XTEAM_NUM_THREADS) map(tofrom:retval) map(to:lc2)
+    for (uint64_t k = 0; k < (_XTEAM_NUM_TEAMS * _XTEAM_NUM_THREADS); k++) {
+      T val2 = lc2.rnv;
+      _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc2.size, lc2.stride, lc2.offset)
+      val2 = (c[i] < val2) ? c[i] : val2;
+      _LIMIT_JUMP_TO_CUDA_REDUCED_THREADS(_XTEAM_NUM_TEAMS)
+      _MIN_OVERLOAD_32_FCT(val2, &retval, lc2.team_vals, lc2.td_ptr, lc2.rnv);
+    }
+  }
+  return retval;
+}
+
+template <typename T, bool DATA_TYPE_IS_INT>
+void _check_val(T computed_val, T gold_val, const char *msg) {
+  double ETOL = 0.0000001;
+  if (DATA_TYPE_IS_INT) {
+    if (computed_val != gold_val) {
+      std::cerr << msg << " FAIL "
+                << "Integer value was " << computed_val << " but should be "
+                << gold_val << std::endl;
+      test_run_rc = 1;
+    }
+  } else {
+    double dcomputed_val = (double)computed_val;
+    double dvalgold = (double)gold_val;
+    double ompErrSum = std::abs((dcomputed_val - dvalgold) / dvalgold);
+    if (ompErrSum > ETOL) {
+      std::cerr << msg << " FAIL " << ompErrSum << " tol:" << ETOL << std::endl
+                << std::setprecision(15) << "Value was " << computed_val
+                << " but should be " << gold_val << std::endl;
+      test_run_rc = 1;
+    }
+  }
+}
+
+#define ALIGNMENT (2 * 1024 * 1024)
+
+template <typename T, bool DATA_TYPE_IS_INT>
+void run_tests(uint64_t array_size) {
+
+  // FIXME: How do we get warpsize of a device from host?
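+  // Workaround used here: launch a trivial target region and ask the
+  // DeviceRTL via __kmpc_get_warp_size().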
+  int warp_size = 64;
+#pragma omp target map(tofrom : warp_size)
+  warp_size = __kmpc_get_warp_size();
+
+  // Align on 2M boundaries
+  T *a = (T *)aligned_alloc(ALIGNMENT, sizeof(T) * array_size);
+  T *b = (T *)aligned_alloc(ALIGNMENT, sizeof(T) * array_size);
+  T *c = (T *)aligned_alloc(ALIGNMENT, sizeof(T) * array_size);
+#pragma omp target enter data map(alloc:a[0:array_size], b[0:array_size], \
+                                  c[0:array_size])
+#pragma omp target teams distribute parallel for
+  for (int64_t i = 0; i < array_size; i++) {
+    a[i] = 2;
+    b[i] = 3;
+    c[i] = (i + 1);
+  }
+
+  std::cout << "Running kernels " << repeat_num_times << " times" << std::endl;
+  std::cout << "Ignoring timing of first " << ignore_times << " runs "
+            << std::endl;
+
+  double ETOL = 0.0000001;
+  if (DATA_TYPE_IS_INT) {
+    std::cout << "Integer Size: " << sizeof(T) << std::endl;
+  } else {
+    if (sizeof(T) == sizeof(float))
+      std::cout << "Precision: float" << std::endl;
+    else
+      std::cout << "Precision: double" << std::endl;
+  }
+
+  std::cout << "Warp size:" << warp_size << std::endl;
+  // int num_teams = ompx_get_device_num_units(omp_get_default_device());
+  int num_teams = _XTEAM_NUM_TEAMS;
+  std::cout << "Array elements: " << array_size << std::endl;
+  std::cout << "Array size: " << ((array_size * sizeof(T)) / (1024 * 1024))
+            << " MB" << std::endl;
+
+  T goldDot = (T)6 * (T)array_size;
+  T goldMax = (T)array_size;
+  T goldMin = (T)1;
+
+  double goldDot_d = (double)goldDot;
+  double goldMax_d = (double)goldMax;
+  double goldMin_d = (double)goldMin;
+
+  // List of times
+  std::vector<std::vector<double>> timings(6);
+
+  // Declare timers
+  std::chrono::high_resolution_clock::time_point t1, t2;
+
+  // Timing loop
+  for (unsigned int k = 0; k < repeat_num_times; k++) {
+    t1 = std::chrono::high_resolution_clock::now();
+    T omp_sum = omp_dot(a, b, array_size);
+    t2 = std::chrono::high_resolution_clock::now();
+    timings[0].push_back(
+        std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1)
+            .count());
+    _check_val<T, DATA_TYPE_IS_INT>(omp_sum, goldDot, "omp_dot");
+
+    t1 = std::chrono::high_resolution_clock::now();
+    T sim_sum = sim_dot(a, b, warp_size);
+    t2 = std::chrono::high_resolution_clock::now();
+    timings[1].push_back(
+        std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1)
+            .count());
+    _check_val<T, DATA_TYPE_IS_INT>(sim_sum, goldDot, "sim_dot");
+
+    t1 = std::chrono::high_resolution_clock::now();
+    T omp_max_val = omp_max(c, array_size);
+    t2 = std::chrono::high_resolution_clock::now();
+    timings[2].push_back(
+        std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1)
+            .count());
+    _check_val<T, DATA_TYPE_IS_INT>(omp_max_val, goldMax, "omp_max");
+
+    t1 = std::chrono::high_resolution_clock::now();
+    T sim_max_val = sim_max(c, warp_size);
+    t2 = std::chrono::high_resolution_clock::now();
+    timings[3].push_back(
+        std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1)
+            .count());
+    _check_val<T, DATA_TYPE_IS_INT>(sim_max_val, goldMax, "sim_max");
+
+    t1 = std::chrono::high_resolution_clock::now();
+    T omp_min_val = omp_min(c, array_size);
+    t2 = std::chrono::high_resolution_clock::now();
+    timings[4].push_back(
+        std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1)
+            .count());
+    _check_val<T, DATA_TYPE_IS_INT>(omp_min_val, goldMin, "omp_min");
+
+    t1 = std::chrono::high_resolution_clock::now();
+    T sim_min_val = sim_min(c, warp_size);
+    t2 = std::chrono::high_resolution_clock::now();
+    timings[5].push_back(
+        std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1)
+            .count());
+    _check_val<T, DATA_TYPE_IS_INT>(sim_min_val, goldMin, "sim_min");
+
+  } // end Timing loop
+
+  // Display timing results
+  std::cout << std::left << std::setw(12) << "Function" << std::left
+            << std::setw(12) << "Best-MB/sec" << std::left << std::setw(12)
+            << " Min (sec)" << std::left << std::setw(12) << " Max"
+            << std::left << std::setw(12) << "Average" << std::left
+            << std::setw(12) << "Avg-MB/sec" << std::endl;
std::setw(12) << " Max" + << std::left << std::setw(12) << "Average" << std::left + << std::setw(12) << "Avg-MB/sec" << std::endl; + + std::cout << std::fixed; + + std::string labels[6] = {"ompdot", "simdot", "ompmax", + "simmax", "ompmin", "simmin"}; + size_t sizes[6] = {2 * sizeof(T) * array_size, 2 * sizeof(T) * array_size, + 1 * sizeof(T) * array_size, 1 * sizeof(T) * array_size, + 1 * sizeof(T) * array_size, 1 * sizeof(T) * array_size}; + + for (int i = 0; i < 6; i++) { + // Get min/max; ignore the first couple results + auto minmax = std::minmax_element(timings[i].begin() + ignore_times, + timings[i].end()); + // Calculate average; ignore ignore_times + double average = std::accumulate(timings[i].begin() + ignore_times, + timings[i].end(), 0.0) / + (double)(repeat_num_times - ignore_times); + printf(" %s %8.0f %8.6f %8.6f %8.6f %8.0f\n", + labels[i].c_str(), 1.0E-6 * sizes[i] / (*minmax.first), + (double)*minmax.first, (double)*minmax.second, (double)average, + 1.0E-6 * sizes[i] / (average)); + } +#pragma omp target exit data map(release: a[0:array_size], b[0:array_size], \ + c[0:array_size]) + free(a); + free(b); + free(c); +} + + +template +void _check_val_complex(TC computed_val_complex, TC gold_val_complex, + const char *msg) { + T gold_val_r = __real__(gold_val_complex); + T computed_val_r = __real__(computed_val_complex); + T gold_val_i = __imag__(gold_val_complex); + T computed_val_i = __imag__(computed_val_complex); + double ETOL = 0.0000001; + double computed_val_r_d = (double)computed_val_r; + double valgold_r_d = (double)gold_val_r; + double ompErrSum_r = abs((computed_val_r_d - valgold_r_d) / valgold_r_d); + double computed_val_i_d = (double)computed_val_i; + double valgold_i_d = (double)gold_val_i; + double ompErrSum_i = abs((computed_val_i_d - valgold_i_d) / valgold_i_d); + if ((ompErrSum_r > ETOL) || (ompErrSum_i > ETOL)) { + std::cerr << msg << " FAIL " << ompErrSum_r << " tol:" << ETOL << std::endl + << std::setprecision(15) << "Value was (" << computed_val_r + << " + " << computed_val_i << " i )" << std::endl + << " but should be (" << gold_val_r << " + " << gold_val_i + << "i) " << std::endl; + test_run_rc = 1; + } +} + +template TC omp_dot_complex(TC *a, TC *b, uint64_t array_size) { + TC dot; + __real__(dot) = 0.0; + __imag__(dot) = 0.0; +#pragma omp target teams distribute parallel for map(tofrom: dot) reduction(+:dot) + for (int64_t i = 0; i < array_size; i++) + dot += a[i] * b[i]; + return dot; +} + +template T sim_dot_complex(T *a, T *b, int warp_size) { + int devid = 0; + T zero_c; + __real__(zero_c) = 0.0; + __imag__(zero_c) = 0.0; + struct loop_ctl_t { + uint32_t *td_ptr; // Atomic counter accessed on device + uint32_t reserved; // reserved + const int64_t stride = 1; // stride to process input vectors + const int64_t offset = 0; // Offset to initial index of input vectors + const int64_t size = _ARRAY_SIZE; // Size of input vectors + T rnv; // reduction null value + T *team_vals; // array of global team values + }; + T sum = zero_c; + uint32_t zero = 0; + static loop_ctl_t lc3; + static int64_t num_teams3 = 0; + if (!num_teams3) { + // num_teams3 = ompx_get_device_num_units(devid); + num_teams3 = _XTEAM_NUM_TEAMS; + lc3.td_ptr = (uint32_t *)omp_target_alloc(sizeof(uint32_t), devid); + lc3.team_vals = (T *)omp_target_alloc(sizeof(T) * num_teams3, devid); + lc3.rnv = zero_c; + omp_target_memcpy(lc3.td_ptr, &zero, sizeof(uint32_t), 0, 0, devid, + omp_get_initial_device()); + } + + if (warp_size == 64) { +#pragma omp target teams distribute parallel for 
+    for (uint64_t k = 0; k < (_XTEAM_NUM_TEAMS * _XTEAM_NUM_THREADS); k++) {
+      T val3 = lc3.rnv;
+      _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc3.size, lc3.stride, lc3.offset)
+      val3 += a[i] * b[i];
+      _SUM_OVERLOAD_64_FCT(val3, &sum, lc3.team_vals, lc3.td_ptr, lc3.rnv);
+    }
+  } else {
+#pragma omp target teams distribute parallel for num_teams(_XTEAM_NUM_TEAMS) \
+    num_threads(_XTEAM_NUM_THREADS) map(tofrom \
+                                        : sum) map(to \
+                                                   : lc3)
+    for (uint64_t k = 0; k < (_XTEAM_NUM_TEAMS * _XTEAM_NUM_THREADS); k++) {
+      T val3 = lc3.rnv;
+      _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc3.size, lc3.stride, lc3.offset)
+      val3 += a[i] * b[i];
+      _LIMIT_JUMP_TO_CUDA_REDUCED_THREADS(_XTEAM_NUM_TEAMS)
+      _SUM_OVERLOAD_32_FCT(val3, &sum, lc3.team_vals, lc3.td_ptr, lc3.rnv);
+    }
+  }
+  return sum;
+}
+
+template <typename TC, typename T>
+void run_tests_complex(const uint64_t array_size) {
+
+  // FIXME: How do we get warpsize of a device from host?
+  int warp_size = 64;
+#pragma omp target map(tofrom : warp_size)
+  warp_size = __kmpc_get_warp_size();
+
+  TC *a = (TC *)aligned_alloc(ALIGNMENT, sizeof(TC) * array_size);
+  TC *b = (TC *)aligned_alloc(ALIGNMENT, sizeof(TC) * array_size);
+
+#pragma omp target enter data map(alloc : a [0:array_size], b [0:array_size])
+  TC startA;
+  __real__(startA) = 1.0;
+  __imag__(startA) = 1.0;
+  TC startB;
+  __real__(startB) = 1.0;
+  __imag__(startB) = -1.0;
+
+#pragma omp target teams distribute parallel for
+  for (int64_t i = 0; i < array_size; i++) {
+    a[i] = startA;
+    b[i] = startB;
+    // a[i] * b[i] = 2 + 0i
+  }
+
+  std::cout << "Running kernels " << repeat_num_times << " times" << std::endl;
+  std::cout << "Ignoring timing of first " << ignore_times << " runs "
+            << std::endl;
+
+  double ETOL = 0.0000001;
+  if (sizeof(TC) == sizeof(float _Complex))
+    std::cout << "Precision: float _Complex" << std::endl;
+  else
+    std::cout << "Precision: double _Complex" << std::endl;
+
+  std::cout << "Warp size:" << warp_size << std::endl;
+  std::cout << "Array elements: " << array_size << std::endl;
+  std::cout << "Array size: " << ((array_size * sizeof(TC)) / (1024 * 1024))
+            << " MB" << std::endl;
+
+  T goldDotr = T(2) * (T)array_size;
+  T goldDoti = T(0);
+
+  TC goldDot;
+  __real__(goldDot) = goldDotr;
+  __imag__(goldDot) = goldDoti;
+
+  // List of times
+  std::vector<std::vector<double>> timings(2);
+
+  // Declare timers
+  std::chrono::high_resolution_clock::time_point t1, t2;
+
+  // timing loop
+  for (unsigned int k = 0; k < repeat_num_times; k++) {
+    t1 = std::chrono::high_resolution_clock::now();
+    TC omp_sum = omp_dot_complex(a, b, array_size);
+    t2 = std::chrono::high_resolution_clock::now();
+    timings[0].push_back(
+        std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1)
+            .count());
+    _check_val_complex<TC, T>(omp_sum, goldDot, "omp_dot");
+
+    t1 = std::chrono::high_resolution_clock::now();
+    TC sim_sum = sim_dot_complex(a, b, warp_size);
+    t2 = std::chrono::high_resolution_clock::now();
+    timings[1].push_back(
+        std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1)
+            .count());
+    _check_val_complex<TC, T>(sim_sum, goldDot, "sim_dot");
+
+  } // end timing loop
+
+  // Display timing results
+  std::cout << std::left << std::setw(12) << "Function" << std::left
+            << std::setw(12) << "Best-MB/sec" << std::left << std::setw(12)
+            << " Min (sec)" << std::left << std::setw(12) << " Max"
+            << std::left << std::setw(12) << "Average" << std::left
+            << std::setw(12) << "Avg-MB/sec" << std::endl;
+
+  std::cout << std::fixed;
+
+  std::string labels[2] = {"ompdot", "simdot"};
+  size_t sizes[2] = {2 * sizeof(TC) * array_size, 2 * sizeof(TC) * array_size};
+
+  // Display timing results
+  std::cout << std::left << std::setw(12) << "Function" << std::left
+            << std::setw(12) << "Best-MB/sec" << std::left << std::setw(12)
+            << " Min (sec)" << std::left << std::setw(12) << " Max"
+            << std::left << std::setw(12) << "Average" << std::left
+            << std::setw(12) << "Avg-MB/sec" << std::endl;
+
+  std::cout << std::fixed;
+
+  std::string labels[2] = {"ompdot", "simdot"};
+  size_t sizes[2] = {2 * sizeof(TC) * array_size, 2 * sizeof(TC) * array_size};
+
+  for (int i = 0; i < 2; i++) {
+    // Get min/max; ignore the first ignore_times results
+    auto minmax = std::minmax_element(timings[i].begin() + ignore_times,
+                                      timings[i].end());
+
+    // Calculate average; ignore the first ignore_times results
+    double average = std::accumulate(timings[i].begin() + ignore_times,
+                                     timings[i].end(), 0.0) /
+                     (double)(repeat_num_times - ignore_times);
+
+    printf(" %s %8.0f %8.6f %8.6f %8.6f %8.0f\n",
+           labels[i].c_str(), 1.0E-6 * sizes[i] / (*minmax.first),
+           (double)*minmax.first, (double)*minmax.second, (double)average,
+           1.0E-6 * sizes[i] / (average));
+  }
+#pragma omp target exit data map(release : a[0:array_size], b[0:array_size])
+  free(a);
+  free(b);
+}
diff --git a/openmp/libomptarget/test/xteamr/test_xteamr.sh b/openmp/libomptarget/test/xteamr/test_xteamr.sh
new file mode 100755
--- /dev/null
+++ b/openmp/libomptarget/test/xteamr/test_xteamr.sh
@@ -0,0 +1,67 @@
+#!/bin/bash
+#===--- test_xteamr.sh - test cross team reduction helper functions -----===//
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+#===----------------------------------------------------------------------===//
+#
+# test_xteamr.sh: Script to test the high performance reduction helper
+# functions in llvm-project/openmp/libomptarget/DeviceRTL/src/Xteamr.cpp.
+# It compiles and executes test_xteamr.cpp in 3 configs:
+# 1024, 512, and 256 device threads.
+#
+# See the README file in this directory for more information.
+#
+#===----------------------------------------------------------------------===//
+
+LLVM_INSTALL=${LLVM_INSTALL:-$HOME/llvm}
+[ ! -f $LLVM_INSTALL/bin/clang ] && echo "ERROR: no LLVM install at $LLVM_INSTALL" && exit 1
+
+OFFLOAD_ARCH=${OFFLOAD_ARCH:-sm_70}
+
+tmpdir=/tmp/$USER/xteamr && mkdir -p $tmpdir
+[ ! -d $tmpdir ] && echo "ERROR: could not create $tmpdir" && exit 1
+
+ARRAY_SIZE=${ARRAY_SIZE:-41943040}
+#ARRAY_SIZE=${ARRAY_SIZE:-33554432}
+as_arg="-D_ARRAY_SIZE=$ARRAY_SIZE"
+
+NUM_TEAMS=${NUM_TEAMS:-80}
+
+cuda_args=""
+CUDA_INSTALL=${CUDA_INSTALL:-/usr/local/cuda}
+[ -d $CUDA_INSTALL ] && cudalib=$CUDA_INSTALL/targets/x86_64-linux/lib && export LD_LIBRARY_PATH=$cudalib && cuda_args="-L$cudalib -lcudart"
+
+nt_args="-D_XTEAM_NUM_THREADS=1024 -D_XTEAM_NUM_TEAMS=$NUM_TEAMS"
+echo " COMPILE with --offload-arch=$OFFLOAD_ARCH $as_arg $nt_args"
+$LLVM_INSTALL/bin/clang++ -O3 -I. $as_arg $nt_args -fopenmp --offload-arch=$OFFLOAD_ARCH test_xteamr.cpp -o $tmpdir/xteamr_1024 $cuda_args -lstdc++ -latomic
+rc1=$?
+
+nt_args="-D_XTEAM_NUM_THREADS=512 -D_XTEAM_NUM_TEAMS=$NUM_TEAMS"
+echo " COMPILE with --offload-arch=$OFFLOAD_ARCH $as_arg $nt_args"
+$LLVM_INSTALL/bin/clang++ -O3 -I. $as_arg $nt_args -fopenmp --offload-arch=$OFFLOAD_ARCH test_xteamr.cpp -o $tmpdir/xteamr_512 $cuda_args -lstdc++ -latomic
+rc2=$?
+
+nt_args="-D_XTEAM_NUM_THREADS=256 -D_XTEAM_NUM_TEAMS=$NUM_TEAMS"
+echo " COMPILE with --offload-arch=$OFFLOAD_ARCH $as_arg $nt_args"
+$LLVM_INSTALL/bin/clang++ -O3 -I. $as_arg $nt_args -fopenmp --offload-arch=$OFFLOAD_ARCH test_xteamr.cpp -o $tmpdir/xteamr_256 $cuda_args -lstdc++ -latomic
+rc3=$?
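+
+# Each binary prints its timing/validation report to stdout, captured
+# below in $tmpdir/xteamr_<nthreads>.out; the compile and run exit codes
+# are accumulated into the final error check at the end of the script.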
+
+[ $rc1 == 0 ] && echo "START EXECUTE xteamr_1024" && $tmpdir/xteamr_1024 > $tmpdir/xteamr_1024.out
+rc4=$?
+[ $rc2 == 0 ] && echo "START EXECUTE xteamr_512" && $tmpdir/xteamr_512 > $tmpdir/xteamr_512.out
+rc5=$?
+[ $rc3 == 0 ] && echo "START EXECUTE xteamr_256" && $tmpdir/xteamr_256 > $tmpdir/xteamr_256.out
+rc6=$?
+
+echo
+rc=$(( $rc1 + $rc2 + $rc3 + $rc4 + $rc5 + $rc6 ))
+if [ $rc != 0 ] ; then
+  echo "ERRORS DETECTED!"
+else
+  echo "No errors detected"
+fi
+echo "Logs and binaries saved to $tmpdir"
+exit $rc
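+
+# Example invocation overriding the defaults (the install path, offload
+# arch, and team count shown here are illustrative only):
+#   LLVM_INSTALL=/opt/rocm/llvm OFFLOAD_ARCH=gfx90a NUM_TEAMS=104 ./test_xteamr.sh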