diff --git a/openmp/libomptarget/DeviceRTL/CMakeLists.txt b/openmp/libomptarget/DeviceRTL/CMakeLists.txt
--- a/openmp/libomptarget/DeviceRTL/CMakeLists.txt
+++ b/openmp/libomptarget/DeviceRTL/CMakeLists.txt
@@ -95,6 +95,7 @@
   ${include_directory}/Synchronization.h
   ${include_directory}/Types.h
   ${include_directory}/Utils.h
+  ${include_directory}/Xteamr.h
 )
 
 set(src_files
@@ -111,6 +112,7 @@
   ${source_directory}/Tasking.cpp
   ${source_directory}/Utils.cpp
   ${source_directory}/Workshare.cpp
+  ${source_directory}/Xteamr.cpp
 )
 
 set(clang_opt_flags -O3 -mllvm -openmp-opt-disable -DSHARED_SCRATCHPAD_SIZE=512)
diff --git a/openmp/libomptarget/DeviceRTL/include/Interface.h b/openmp/libomptarget/DeviceRTL/include/Interface.h
--- a/openmp/libomptarget/DeviceRTL/include/Interface.h
+++ b/openmp/libomptarget/DeviceRTL/include/Interface.h
@@ -13,6 +13,7 @@
 #define OMPTARGET_DEVICERTL_INTERFACE_H
 
 #include "Types.h"
+#include "Xteamr.h"
 
 /// External API
 ///
diff --git a/openmp/libomptarget/DeviceRTL/include/Xteamr.h b/openmp/libomptarget/DeviceRTL/include/Xteamr.h
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/DeviceRTL/include/Xteamr.h
@@ -0,0 +1,250 @@
+//===---------------- Xteamr.h - OpenMP interface ----------------- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// Modifications Copyright (c) 2022 Advanced Micro Devices, All rights reserved.
+// Notified per clause 4(b) of the license.
+//
+//===----------------------------------------------------------------------===//
+//
+// DeviceRTL Header file: Xteamr.h
+// External __kmpc headers for cross team reduction functions defined
+// in DeviceRTL/src/Xteamr.cpp. Clang generates a call to one of these
+// functions when it encounters a reduction. The specific function depends
+// on the data type, the warp size, and the number of waves in the team.
+// The number of waves must be a power of 2 and the total number of threads
+// must be greater than or equal to the number of teams.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OMPTARGET_DEVICERTL_XTEAMR_H
+#define OMPTARGET_DEVICERTL_XTEAMR_H
+#include "Types.h"
+
+#define _CD double _Complex
+#define _CF float _Complex
+#define _UI unsigned int
+#define _UL unsigned long
+#define _INLINE_ATTR_ __attribute__((flatten, always_inline))
+#define _RF_LDS volatile __attribute__((address_space(3)))
+
+/// __kmpc_xteamr_<dt>_<nw>x<wsz>: cross team reduction functions
+///   <dt>:  letter(s) representing the data type
+///   <nw>:  number of warps/waves in the thread block
+///   <wsz>: warp size, so <nw> x <wsz> is the team size in threads
+///          example: 16x64 is a team size of 1024 threads
+/// arg1: the thread local reduction value.
+/// arg2: pointer to where result is to be written.
+/// arg3: global array of team values for this reduction instance.
+/// arg4: pointer to atomic counter of completed teams +/// arg5: function pointer to reduction type function (sum,min,max) +/// arg6: function pointer to reduction type function on LDS memory +/// arg7: Reduction null value +extern "C" { +void _INLINE_ATTR_ __kmpc_xteamr_d_16x64( + double v, double *r_ptr, double *tvs, uint32_t *td, + void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double rnv); +void _INLINE_ATTR_ __kmpc_xteamr_f_16x64( + float v, float *r_ptr, float *tvs, uint32_t *td, + void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float rnv); +void _INLINE_ATTR_ __kmpc_xteamr_cd_16x64( + _CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), + void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD rnv); +void _INLINE_ATTR_ __kmpc_xteamr_cf_16x64( + _CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), + void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF rnv); +void _INLINE_ATTR_ __kmpc_xteamr_i_16x64( + int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int rnv); +void _INLINE_ATTR_ __kmpc_xteamr_ui_16x64( + _UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), + void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI rnv); +void _INLINE_ATTR_ __kmpc_xteamr_l_16x64( + long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long rnv); +void _INLINE_ATTR_ __kmpc_xteamr_ul_16x64( + _UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), + void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL rnv); +void _INLINE_ATTR_ __kmpc_xteamr_d_32x32( + double v, double *r_ptr, double *tvs, uint32_t *td, + void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double rnv); +void _INLINE_ATTR_ __kmpc_xteamr_f_32x32( + float v, float *r_ptr, float *tvs, uint32_t *td, + void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float rnv); +void _INLINE_ATTR_ __kmpc_xteamr_cd_32x32( + _CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), + void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD rnv); +void _INLINE_ATTR_ __kmpc_xteamr_cf_32x32( + _CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), + void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF rnv); +void _INLINE_ATTR_ __kmpc_xteamr_i_32x32( + int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int rnv); +void _INLINE_ATTR_ __kmpc_xteamr_ui_32x32( + _UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), + void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI rnv); +void _INLINE_ATTR_ __kmpc_xteamr_l_32x32( + long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long rnv); +void _INLINE_ATTR_ __kmpc_xteamr_ul_32x32( + _UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), + void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL rnv); +void _INLINE_ATTR_ __kmpc_xteamr_d_8x64( + double v, double *r_ptr, double *tvs, uint32_t *td, + void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double rnv); +void _INLINE_ATTR_ __kmpc_xteamr_f_8x64( + float v, float *r_ptr, float *tvs, uint32_t *td, + void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS 
float *, _RF_LDS float *), const float rnv); +void _INLINE_ATTR_ __kmpc_xteamr_cd_8x64( + _CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), + void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD rnv); +void _INLINE_ATTR_ __kmpc_xteamr_cf_8x64( + _CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), + void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF rnv); +void _INLINE_ATTR_ __kmpc_xteamr_i_8x64( + int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int rnv); +void _INLINE_ATTR_ __kmpc_xteamr_ui_8x64( + _UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), + void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI rnv); +void _INLINE_ATTR_ __kmpc_xteamr_l_8x64( + long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long rnv); +void _INLINE_ATTR_ __kmpc_xteamr_ul_8x64( + _UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), + void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL rnv); +void _INLINE_ATTR_ __kmpc_xteamr_d_16x32( + double v, double *r_ptr, double *tvs, uint32_t *td, + void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double rnv); +void _INLINE_ATTR_ __kmpc_xteamr_f_16x32( + float v, float *r_ptr, float *tvs, uint32_t *td, + void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float rnv); +void _INLINE_ATTR_ __kmpc_xteamr_cd_16x32( + _CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), + void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD rnv); +void _INLINE_ATTR_ __kmpc_xteamr_cf_16x32( + _CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), + void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF rnv); +void _INLINE_ATTR_ __kmpc_xteamr_i_16x32( + int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int rnv); +void _INLINE_ATTR_ __kmpc_xteamr_ui_16x32( + _UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), + void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI rnv); +void _INLINE_ATTR_ __kmpc_xteamr_l_16x32( + long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long rnv); +void _INLINE_ATTR_ __kmpc_xteamr_ul_16x32( + _UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), + void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL rnv); +void _INLINE_ATTR_ __kmpc_xteamr_d_4x64( + double v, double *r_ptr, double *tvs, uint32_t *td, + void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double rnv); +void _INLINE_ATTR_ __kmpc_xteamr_f_4x64( + float v, float *r_ptr, float *tvs, uint32_t *td, + void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float rnv); +void _INLINE_ATTR_ __kmpc_xteamr_cd_4x64( + _CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), + void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD rnv); +void _INLINE_ATTR_ __kmpc_xteamr_cf_4x64( + _CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), + void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF rnv); +void _INLINE_ATTR_ __kmpc_xteamr_i_4x64( + int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int rnv); +void _INLINE_ATTR_ 
__kmpc_xteamr_ui_4x64( + _UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), + void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI rnv); +void _INLINE_ATTR_ __kmpc_xteamr_l_4x64( + long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long rnv); +void _INLINE_ATTR_ __kmpc_xteamr_ul_4x64( + _UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), + void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL rnv); +void _INLINE_ATTR_ __kmpc_xteamr_d_8x32( + double v, double *r_ptr, double *tvs, uint32_t *td, + void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double rnv); +void _INLINE_ATTR_ __kmpc_xteamr_f_8x32( + float v, float *r_ptr, float *tvs, uint32_t *td, + void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float rnv); +void _INLINE_ATTR_ __kmpc_xteamr_cd_8x32( + _CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), + void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD rnv); +void _INLINE_ATTR_ __kmpc_xteamr_cf_8x32( + _CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), + void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF rnv); +void _INLINE_ATTR_ __kmpc_xteamr_i_8x32( + int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int rnv); +void _INLINE_ATTR_ __kmpc_xteamr_ui_8x32( + _UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), + void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI rnv); +void _INLINE_ATTR_ __kmpc_xteamr_l_8x32( + long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long rnv); +void _INLINE_ATTR_ __kmpc_xteamr_ul_8x32( + _UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), + void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL rnv); +void __kmpc_rfun_sum_d(double *val, double otherval); +void __kmpc_rfun_sum_lds_d(_RF_LDS double *val, _RF_LDS double *otherval); +void __kmpc_rfun_sum_f(float *val, float otherval); +void __kmpc_rfun_sum_lds_f(_RF_LDS float *val, _RF_LDS float *otherval); +void __kmpc_rfun_sum_cd(_CD *val, _CD otherval); +void __kmpc_rfun_sum_lds_cd(_RF_LDS _CD *val, _RF_LDS _CD *otherval); +void __kmpc_rfun_sum_cf(_CF *val, _CF otherval); +void __kmpc_rfun_sum_lds_cf(_RF_LDS _CF *val, _RF_LDS _CF *otherval); +void __kmpc_rfun_sum_i(int *val, int otherval); +void __kmpc_rfun_sum_lds_i(_RF_LDS int *val, _RF_LDS int *otherval); +void __kmpc_rfun_sum_ui(_UI *val, _UI otherval); +void __kmpc_rfun_sum_lds_ui(_RF_LDS _UI *val, _RF_LDS _UI *otherval); +void __kmpc_rfun_sum_l(long *val, long otherval); +void __kmpc_rfun_sum_lds_l(_RF_LDS long *val, _RF_LDS long *otherval); +void __kmpc_rfun_sum_ul(_UL *val, _UL otherval); +void __kmpc_rfun_sum_lds_ul(_RF_LDS _UL *val, _RF_LDS _UL *otherval); +void __kmpc_rfun_max_d(double *val, double otherval); +void __kmpc_rfun_max_lds_d(_RF_LDS double *val, _RF_LDS double *otherval); +void __kmpc_rfun_max_f(float *val, float otherval); +void __kmpc_rfun_max_lds_f(_RF_LDS float *val, _RF_LDS float *otherval); +void __kmpc_rfun_max_i(int *val, int otherval); +void __kmpc_rfun_max_lds_i(_RF_LDS int *val, _RF_LDS int *otherval); +void __kmpc_rfun_max_ui(_UI *val, _UI otherval); +void __kmpc_rfun_max_lds_ui(_RF_LDS _UI *val, _RF_LDS _UI *otherval); +void __kmpc_rfun_max_l(long *val, long otherval); +void 
__kmpc_rfun_max_lds_l(_RF_LDS long *val, _RF_LDS long *otherval); +void __kmpc_rfun_max_ul(_UL *val, _UL otherval); +void __kmpc_rfun_max_lds_ul(_RF_LDS _UL *val, _RF_LDS _UL *otherval); +void __kmpc_rfun_min_d(double *val, double otherval); +void __kmpc_rfun_min_lds_d(_RF_LDS double *val, _RF_LDS double *otherval); +void __kmpc_rfun_min_f(float *val, float otherval); +void __kmpc_rfun_min_lds_f(_RF_LDS float *val, _RF_LDS float *otherval); +void __kmpc_rfun_min_i(int *val, int otherval); +void __kmpc_rfun_min_lds_i(_RF_LDS int *val, _RF_LDS int *otherval); +void __kmpc_rfun_min_ui(_UI *val, _UI otherval); +void __kmpc_rfun_min_lds_ui(_RF_LDS _UI *val, _RF_LDS _UI *otherval); +void __kmpc_rfun_min_l(long *val, long otherval); +void __kmpc_rfun_min_lds_l(_RF_LDS long *val, _RF_LDS long *otherval); +void __kmpc_rfun_min_ul(_UL *val, _UL otherval); +void __kmpc_rfun_min_lds_ul(_RF_LDS _UL *val, _RF_LDS _UL *otherval); +} // end extern C + +#undef _CD +#undef _CF +#undef _UI +#undef _UL +#undef _INLINE_ATTR_ +#undef _RF_LDS + +#endif // of ifndef OMPTARGET_DEVICERTL_XTEAMR_H diff --git a/openmp/libomptarget/DeviceRTL/src/Xteamr.cpp b/openmp/libomptarget/DeviceRTL/src/Xteamr.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Xteamr.cpp @@ -0,0 +1,851 @@ +//===---- Xteamr.cpp - OpenMP cross team helper functions ---- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains helper functions for cross team reductions +// +//===----------------------------------------------------------------------===// + +#include "Xteamr.h" +#include "Debug.h" +#include "Interface.h" +#include "Mapping.h" +#include "State.h" +#include "Synchronization.h" +#include "Types.h" +#include "Utils.h" + +#define __XTEAM_SHARED_LDS volatile __attribute__((address_space(3))) + +using namespace _OMP; + +#pragma omp begin declare target device_type(nohost) + +// Headers for specialized shfl_xor +double xteamr_shfl_xor_d(double var, const int lane_mask, const uint32_t width); +float xteamr_shfl_xor_f(float var, const int lane_mask, const uint32_t width); +int xteamr_shfl_xor_int(int var, const int lane_mask, const uint32_t width); +double _Complex xteamr_shfl_xor_cd(double _Complex var, const int lane_mask, + const uint32_t width); +float _Complex xteamr_shfl_xor_cf(float _Complex var, const int lane_mask, + const uint32_t width); + +// Define the arch (amdgcn vs nvptx) variants of shfl + +#pragma omp begin declare variant match(device = {arch(amdgcn)}) + +int xteamr_shfl_xor_int(int var, const int lane_mask, const uint32_t width) { + int self = mapping::getThreadIdInWarp(); // __lane_id(); + int index = self ^ lane_mask; + index = index >= ((self + width) & ~(width - 1)) ? 
self : index;
+  return __builtin_amdgcn_ds_bpermute(index << 2, var);
+}
+float xteamr_shfl_xor_f(float var, const int lane_mask, const uint32_t width) {
+  union {
+    int i;
+    unsigned u;
+    float f;
+  } tmp;
+  tmp.f = var;
+  tmp.i = xteamr_shfl_xor_int(tmp.i, lane_mask, width);
+  return tmp.f;
+}
+double xteamr_shfl_xor_d(double var, const int lane_mask,
+                         const uint32_t width) {
+  static_assert(sizeof(double) == 2 * sizeof(int), "");
+  static_assert(sizeof(double) == sizeof(uint64_t), "");
+
+  int tmp[2];
+  __builtin_memcpy(tmp, &var, sizeof(tmp));
+  tmp[0] = xteamr_shfl_xor_int(tmp[0], lane_mask, width);
+  tmp[1] = xteamr_shfl_xor_int(tmp[1], lane_mask, width);
+
+  uint64_t tmp0 =
+      (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+  double tmp1;
+  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+  return tmp1;
+}
+
+double _Complex xteamr_shfl_xor_cd(double _Complex var, const int lane_mask,
+                                   const uint32_t width) {
+  __real__(var) = xteamr_shfl_xor_d(__real__(var), lane_mask, width);
+  __imag__(var) = xteamr_shfl_xor_d(__imag__(var), lane_mask, width);
+  return var;
+}
+float _Complex xteamr_shfl_xor_cf(float _Complex var, const int lane_mask,
+                                  const uint32_t width) {
+  __real__(var) = xteamr_shfl_xor_f(__real__(var), lane_mask, width);
+  __imag__(var) = xteamr_shfl_xor_f(__imag__(var), lane_mask, width);
+  return var;
+}
+#pragma omp end declare variant
+
+#pragma omp begin declare variant match(                                       \
+    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
+
+int xteamr_shfl_xor_int(int var, const int lane_mask, const uint32_t width) {
+  int c = ((32 - width) << 8) | 0x1f;
+  return __nvvm_shfl_sync_bfly_i32(0xFFFFFFFF, var, lane_mask, c);
+}
+float xteamr_shfl_xor_f(float var, const int lane_mask, const uint32_t width) {
+  union {
+    int i;
+    unsigned u;
+    float f;
+  } tmp;
+  tmp.f = var;
+  tmp.i = xteamr_shfl_xor_int(tmp.i, lane_mask, width);
+  return tmp.f;
+}
+double xteamr_shfl_xor_d(double var, int laneMask, const uint32_t width) {
+  unsigned lo, hi;
+  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
+  hi = xteamr_shfl_xor_int(hi, laneMask, width);
+  lo = xteamr_shfl_xor_int(lo, laneMask, width);
+  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi));
+  return var;
+}
+double _Complex xteamr_shfl_xor_cd(double _Complex var, const int lane_mask,
+                                   const uint32_t width) {
+  __real__(var) = xteamr_shfl_xor_d(__real__(var), lane_mask, width);
+  __imag__(var) = xteamr_shfl_xor_d(__imag__(var), lane_mask, width);
+  return var;
+}
+float _Complex xteamr_shfl_xor_cf(float _Complex var, const int lane_mask,
+                                  const uint32_t width) {
+  __real__(var) = xteamr_shfl_xor_f(__real__(var), lane_mask, width);
+  __imag__(var) = xteamr_shfl_xor_f(__imag__(var), lane_mask, width);
+  return var;
+}
+#pragma omp end declare variant
+// } // end impl namespace
+
+// tag dispatching of type specific shfl_xor, get_low, and get_high
+struct _d_tag {};
+struct _f_tag {};
+struct _cd_tag {};
+struct _cf_tag {};
+struct _i_tag {};
+struct _ui_tag {};
+struct _l_tag {};
+struct _ul_tag {};
+template <typename T> struct __dispatch_tag;
+template <> struct __dispatch_tag<double> {
+  typedef _d_tag type;
+};
+template <> struct __dispatch_tag<float> {
+  typedef _f_tag type;
+};
+template <> struct __dispatch_tag<double _Complex> {
+  typedef _cd_tag type;
+};
+template <> struct __dispatch_tag<float _Complex> {
+  typedef _cf_tag type;
+};
+template <> struct __dispatch_tag<int> {
+  typedef _i_tag type;
+};
+template <> struct __dispatch_tag<unsigned int> {
+  typedef _ui_tag type;
+};
+template <> struct __dispatch_tag<long> {
+  typedef _l_tag type;
+};
+template <> struct __dispatch_tag<unsigned long> {
+  typedef _ul_tag type;
+};
+
+template <const int _WSZ>
+double xteamr_shfl_xor(_d_tag tag, double var, const int lane_mask) {
+  return xteamr_shfl_xor_d(var, lane_mask, _WSZ);
+}
+template <const int _WSZ>
+float xteamr_shfl_xor(_f_tag tag, float var, const int lane_mask) {
+  return xteamr_shfl_xor_f(var, lane_mask, _WSZ);
+}
+template <const int _WSZ>
+double _Complex xteamr_shfl_xor(_cd_tag tag, double _Complex var,
+                                const int lane_mask) {
+  return xteamr_shfl_xor_cd(var, lane_mask, _WSZ);
+}
+template <const int _WSZ>
+float _Complex xteamr_shfl_xor(_cf_tag tag, float _Complex var,
+                               const int lane_mask) {
+  return xteamr_shfl_xor_cf(var, lane_mask, _WSZ);
+}
+template <const int _WSZ>
+int xteamr_shfl_xor(_i_tag tag, int var, const int lane_mask) {
+  return xteamr_shfl_xor_int(var, lane_mask, _WSZ);
+}
+template <const int _WSZ>
+unsigned int xteamr_shfl_xor(_ui_tag tag, unsigned int var,
+                             const int lane_mask) {
+  return xteamr_shfl_xor_int(var, lane_mask, _WSZ);
+}
+template <const int _WSZ>
+long xteamr_shfl_xor(_l_tag tag, long var, const int lane_mask) {
+  return xteamr_shfl_xor_d(var, lane_mask, _WSZ);
+}
+template <const int _WSZ>
+unsigned long xteamr_shfl_xor(_ul_tag tag, unsigned long var,
+                              const int lane_mask) {
+  return xteamr_shfl_xor_d(var, lane_mask, _WSZ);
+}
+
+template <typename T, const int _WSZ>
+T xteamr_shfl_xor(T var, const int lane_mask) {
+  typedef typename __dispatch_tag<T>::type tag;
+  return xteamr_shfl_xor<_WSZ>(tag(), var, lane_mask);
+}
+
+template <typename T, const int _NW, const int _WSZ>
+__attribute__((flatten, always_inline)) void _xteam_reduction(
+    T val, T *r_ptr, T *team_vals, uint32_t *teams_done_ptr,
+    void (*_rf)(T *, T),
+    void (*_rf_lds)(__XTEAM_SHARED_LDS T *, __XTEAM_SHARED_LDS T *),
+    const T inival) {
+
+  const uint32_t omp_thread_num = mapping::getThreadIdInBlock();
+  const uint32_t omp_team_num = mapping::getBlockId();
+  const uint32_t wave_num = mapping::getWarpId();         // 0 to 15
+  const uint32_t lane_num = mapping::getThreadIdInWarp(); // 0 to 63
+  const uint32_t NumTeams = mapping::getNumberOfBlocks();
+  static __XTEAM_SHARED_LDS T xwave_lds[_NW + 1];
+  static __XTEAM_SHARED_LDS bool __is_last_team;
+
+// Cuda may restrict max threads, so clear unused wave values
+#ifdef __NVPTX__
+  if (_NW == 32) {
+    if (omp_thread_num == 0) {
+      for (uint32_t i = (omp_get_num_threads() / 32); i < _NW; i++)
+        xwave_lds[i] = inival;
+    }
+  }
+#endif
+
+  // Binary reduce each wave, then copy to xwave_lds[wave_num]
+  for (unsigned int offset = _WSZ / 2; offset > 0; offset >>= 1)
+    (*_rf)(&val, xteamr_shfl_xor<T, _WSZ>(val, offset));
+  if (lane_num == 0)
+    xwave_lds[wave_num] = val;
+
+  // Binary reduce all wave values into xwave_lds[0]
+  _OMP::synchronize::threadsAligned();
+  for (unsigned int offset = _NW / 2; offset > 0; offset >>= 1) {
+    if (omp_thread_num < offset)
+      (*_rf_lds)(&(xwave_lds[omp_thread_num]),
+                 &(xwave_lds[omp_thread_num + offset]));
+  }
+  _OMP::synchronize::threadsAligned();
+
+  // Discover the last team to complete the cross wave reduction.
+  // The team number of the last team is nondeterministic.
+  __is_last_team = false;
+  if (omp_thread_num == 0) {
+    team_vals[omp_team_num] = xwave_lds[0];
+    uint32_t td = atomic::inc(teams_done_ptr, NumTeams - 1u, atomic::seq_cst);
+    if (td == (NumTeams - 1u))
+      __is_last_team = true;
+  }
+
+  // This sync is needed so all threads from the last team know they
+  // are in the last team.
+  _OMP::synchronize::threadsAligned();
+
+  if (__is_last_team) {
+    // All threads from the last completed team enter here.
+    // All other teams exit the helper function.
+    // must have more threads than teams
+    if (omp_thread_num < NumTeams)
+      val = team_vals[omp_thread_num];
+    else
+      val = inival;
+
+    _OMP::synchronize::threadsAligned();
+
+    // Reduce each wave into xwave_lds[wave_num]
+    for (unsigned int offset = _WSZ / 2; offset > 0; offset >>= 1)
+      (*_rf)(&val, xteamr_shfl_xor<T, _WSZ>(val, offset));
+    if (lane_num == 0)
+      xwave_lds[wave_num] = val;
+
+    // To get the final result, we know xwave_lds[0] is done,
+    // but we need a sync here to ensure xwave_lds[i!=0] is correct.
+    _OMP::synchronize::threadsAligned();
+
+    // Typically only 2 usable waves when <128 CUs. No gain to parallelizing
+    // these last 2 reductions. So do these on thread 0 into lane 0's val.
+    if (omp_thread_num == 0) {
+      unsigned int usableWaves = ((NumTeams - 1) / _WSZ) + 1;
+      // Reduce with the original result value.
+      xwave_lds[usableWaves] = *r_ptr;
+      for (unsigned int kk = 1; kk <= usableWaves; kk++)
+        (*_rf_lds)(&xwave_lds[0], &xwave_lds[kk]);
+
+      *r_ptr = xwave_lds[0];
+    }
+
+    // This sync is needed to prevent warps in the last team from starting
+    // if there was another reduction.
+    _OMP::synchronize::threadsAligned();
+  }
+}
+
+// Calls to these __kmpc extern C functions are created in clang codegen
+// for FORTRAN, C, and C++. They may also be used for simulation and testing.
+// The headers for these extern C functions are in ../include/Xteamr.h,
+// which is included by Interface.h. The compiler builds the name based on
+// the data type, the number of waves in the team, and the warp size.
+//
+#define _EXT_ATTR extern "C" __attribute__((flatten, always_inline)) void
+_EXT_ATTR
+__kmpc_xteamr_d_16x64(double v, double *r_ptr, double *tvals, uint32_t *td_ptr,
+                      void (*_rf)(double *, double),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS double *,
+                                      __XTEAM_SHARED_LDS double *),
+                      const double iv) {
+  _xteam_reduction<double, 16, 64>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_f_16x64(float v, float *r_ptr, float *tvals, uint32_t *td_ptr,
+                      void (*_rf)(float *, float),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS float *,
+                                      __XTEAM_SHARED_LDS float *),
+                      const float iv) {
+  _xteam_reduction<float, 16, 64>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+
+_EXT_ATTR
+__kmpc_xteamr_cd_16x64(double _Complex v, double _Complex *r_ptr,
+                       double _Complex *tvals, uint32_t *td_ptr,
+                       void (*_rf)(double _Complex *, double _Complex),
+                       void (*_rf_lds)(__XTEAM_SHARED_LDS double _Complex *,
+                                       __XTEAM_SHARED_LDS double _Complex *),
+                       double _Complex iv) {
+  _xteam_reduction<double _Complex, 16, 64>(v, r_ptr, tvals, td_ptr, _rf,
+                                            _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_cf_16x64(float _Complex v, float _Complex *r_ptr,
+                       float _Complex *tvals, uint32_t *td_ptr,
+                       void (*_rf)(float _Complex *, float _Complex),
+                       void (*_rf_lds)(__XTEAM_SHARED_LDS float _Complex *,
+                                       __XTEAM_SHARED_LDS float _Complex *),
+                       float _Complex iv) {
+  _xteam_reduction<float _Complex, 16, 64>(v, r_ptr, tvals, td_ptr, _rf,
+                                           _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_i_16x64(int v, int *r_ptr, int *tvals, uint32_t *td_ptr,
+                      void (*_rf)(int *, int),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS int *,
+                                      __XTEAM_SHARED_LDS int *),
+                      const int iv) {
+  _xteam_reduction<int, 16, 64>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_ui_16x64(uint32_t v, uint32_t *r_ptr, uint32_t *tvals,
+                       uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t),
+                       void (*_rf_lds)(__XTEAM_SHARED_LDS uint32_t *,
+                                       __XTEAM_SHARED_LDS uint32_t *),
+                       const uint32_t iv) {
+  _xteam_reduction<uint32_t, 16, 64>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds,
+                                     iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_l_16x64(long v, long *r_ptr, long *tvals, uint32_t *td_ptr,
+                      void (*_rf)(long *, long),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS long *,
+                                      __XTEAM_SHARED_LDS long *),
+                      const long iv) {
+  _xteam_reduction<long, 16, 64>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_ul_16x64(uint64_t v, uint64_t *r_ptr, uint64_t *tvals,
+                       uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t),
+                       void (*_rf_lds)(__XTEAM_SHARED_LDS uint64_t *,
+                                       __XTEAM_SHARED_LDS uint64_t *),
+                       const uint64_t iv) {
+  _xteam_reduction<uint64_t, 16, 64>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds,
+                                     iv);
+}
+
+_EXT_ATTR
+__kmpc_xteamr_d_8x64(double v, double *r_ptr, double *tvals, uint32_t *td_ptr,
+                     void (*_rf)(double *, double),
+                     void (*_rf_lds)(__XTEAM_SHARED_LDS double *,
+                                     __XTEAM_SHARED_LDS double *),
+                     const double iv) {
+  _xteam_reduction<double, 8, 64>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_f_8x64(float v, float *r_ptr, float *tvals, uint32_t *td_ptr,
+                     void (*_rf)(float *, float),
+                     void (*_rf_lds)(__XTEAM_SHARED_LDS float *,
+                                     __XTEAM_SHARED_LDS float *),
+                     const float iv) {
+  _xteam_reduction<float, 8, 64>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_cd_8x64(double _Complex v, double _Complex *r_ptr,
+                      double _Complex *tvals, uint32_t *td_ptr,
+                      void (*_rf)(double _Complex *, double _Complex),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS double _Complex *,
+                                      __XTEAM_SHARED_LDS double _Complex *),
+                      const double _Complex iv) {
+  _xteam_reduction<double _Complex, 8, 64>(v, r_ptr, tvals, td_ptr, _rf,
+                                           _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_cf_8x64(float _Complex v, float _Complex *r_ptr,
+                      float _Complex *tvals, uint32_t *td_ptr,
+                      void (*_rf)(float _Complex *, float _Complex),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS float _Complex *,
+                                      __XTEAM_SHARED_LDS float _Complex *),
+                      const float _Complex iv) {
+  _xteam_reduction<float _Complex, 8, 64>(v, r_ptr, tvals, td_ptr, _rf,
+                                          _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_i_8x64(int v, int *r_ptr, int *tvals, uint32_t *td_ptr,
+                     void (*_rf)(int *, int),
+                     void (*_rf_lds)(__XTEAM_SHARED_LDS int *,
+                                     __XTEAM_SHARED_LDS int *),
+                     const int iv) {
+  _xteam_reduction<int, 8, 64>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_ui_8x64(uint32_t v, uint32_t *r_ptr, uint32_t *tvals,
+                      uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS uint32_t *,
+                                      __XTEAM_SHARED_LDS uint32_t *),
+                      const uint32_t iv) {
+  _xteam_reduction<uint32_t, 8, 64>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_l_8x64(long v, long *r_ptr, long *tvals, uint32_t *td_ptr,
+                     void (*_rf)(long *, long),
+                     void (*_rf_lds)(__XTEAM_SHARED_LDS long *,
+                                     __XTEAM_SHARED_LDS long *),
+                     const long iv) {
+  _xteam_reduction<long, 8, 64>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_ul_8x64(uint64_t v, uint64_t *r_ptr, uint64_t *tvals,
+                      uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS uint64_t *,
+                                      __XTEAM_SHARED_LDS uint64_t *),
+                      const uint64_t iv) {
+  _xteam_reduction<uint64_t, 8, 64>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+
+_EXT_ATTR
+__kmpc_xteamr_d_4x64(double v, double *r_ptr, double *tvals, uint32_t *td_ptr,
+                     void (*_rf)(double *, double),
+                     void (*_rf_lds)(__XTEAM_SHARED_LDS double *,
+                                     __XTEAM_SHARED_LDS double *),
+                     const double iv) {
+  _xteam_reduction<double, 4, 64>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_f_4x64(float v, float *r_ptr, float *tvals, uint32_t *td_ptr,
+                     void (*_rf)(float *, float),
+                     void (*_rf_lds)(__XTEAM_SHARED_LDS float *,
+                                     __XTEAM_SHARED_LDS float *),
+                     const float iv) {
+  _xteam_reduction<float, 4, 64>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_cd_4x64(double _Complex v, double _Complex *r_ptr,
+                      double _Complex *tvals, uint32_t *td_ptr,
+                      void (*_rf)(double _Complex *, double _Complex),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS double _Complex *,
+                                      __XTEAM_SHARED_LDS double _Complex *),
+                      const double _Complex iv) {
+  _xteam_reduction<double _Complex, 4, 64>(v, r_ptr, tvals, td_ptr, _rf,
+                                           _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_cf_4x64(float _Complex v, float _Complex *r_ptr,
+                      float _Complex *tvals, uint32_t *td_ptr,
+                      void (*_rf)(float _Complex *, float _Complex),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS float _Complex *,
+                                      __XTEAM_SHARED_LDS float _Complex *),
+                      const float _Complex iv) {
+  _xteam_reduction<float _Complex, 4, 64>(v, r_ptr, tvals, td_ptr, _rf,
+                                          _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_i_4x64(int v, int *r_ptr, int *tvals, uint32_t *td_ptr,
+                     void (*_rf)(int *, int),
+                     void (*_rf_lds)(__XTEAM_SHARED_LDS int *,
+                                     __XTEAM_SHARED_LDS int *),
+                     const int iv) {
+  _xteam_reduction<int, 4, 64>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_ui_4x64(uint32_t v, uint32_t *r_ptr, uint32_t *tvals,
+                      uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS uint32_t *,
+                                      __XTEAM_SHARED_LDS uint32_t *),
+                      const uint32_t iv) {
+  _xteam_reduction<uint32_t, 4, 64>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_l_4x64(long v, long *r_ptr, long *tvals, uint32_t *td_ptr,
+                     void (*_rf)(long *, long),
+                     void (*_rf_lds)(__XTEAM_SHARED_LDS long *,
+                                     __XTEAM_SHARED_LDS long *),
+                     const long iv) {
+  _xteam_reduction<long, 4, 64>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_ul_4x64(uint64_t v, uint64_t *r_ptr, uint64_t *tvals,
+                      uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS uint64_t *,
+                                      __XTEAM_SHARED_LDS uint64_t *),
+                      const uint64_t iv) {
+  _xteam_reduction<uint64_t, 4, 64>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+
+_EXT_ATTR
+__kmpc_xteamr_d_32x32(double v, double *r_ptr, double *tvals, uint32_t *td_ptr,
+                      void (*_rf)(double *, double),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS double *,
+                                      __XTEAM_SHARED_LDS double *),
+                      const double iv) {
+  _xteam_reduction<double, 32, 32>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_f_32x32(float v, float *r_ptr, float *tvals, uint32_t *td_ptr,
+                      void (*_rf)(float *, float),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS float *,
+                                      __XTEAM_SHARED_LDS float *),
+                      const float iv) {
+  _xteam_reduction<float, 32, 32>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_cd_32x32(double _Complex v, double _Complex *r_ptr,
+                       double _Complex *tvals, uint32_t *td_ptr,
+                       void (*_rf)(double _Complex *, double _Complex),
+                       void (*_rf_lds)(__XTEAM_SHARED_LDS double _Complex *,
+                                       __XTEAM_SHARED_LDS double _Complex *),
+                       const double _Complex iv) {
+  _xteam_reduction<double _Complex, 32, 32>(v, r_ptr, tvals, td_ptr, _rf,
+                                            _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_cf_32x32(float _Complex v, float _Complex *r_ptr,
+                       float _Complex *tvals, uint32_t *td_ptr,
+                       void (*_rf)(float _Complex *, float _Complex),
+                       void (*_rf_lds)(__XTEAM_SHARED_LDS float _Complex *,
+                                       __XTEAM_SHARED_LDS float _Complex *),
+                       const float _Complex iv) {
+  _xteam_reduction<float _Complex, 32, 32>(v, r_ptr, tvals, td_ptr, _rf,
+                                           _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_i_32x32(int v, int *r_ptr, int *tvals, uint32_t *td_ptr,
+                      void (*_rf)(int *, int),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS int *,
+                                      __XTEAM_SHARED_LDS int *),
+                      const int iv) {
+  _xteam_reduction<int, 32, 32>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_ui_32x32(uint32_t v, uint32_t *r_ptr, uint32_t *tvals,
+                       uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t),
+                       void (*_rf_lds)(__XTEAM_SHARED_LDS uint32_t *,
+                                       __XTEAM_SHARED_LDS uint32_t *),
+                       const uint32_t iv) {
+  _xteam_reduction<uint32_t, 32, 32>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds,
+                                     iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_l_32x32(long v, long *r_ptr, long *tvals, uint32_t *td_ptr,
+                      void (*_rf)(long *, long),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS long *,
+                                      __XTEAM_SHARED_LDS long *),
+                      const long iv) {
+  _xteam_reduction<long, 32, 32>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_ul_32x32(uint64_t v, uint64_t *r_ptr, uint64_t *tvals,
+                       uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t),
+                       void (*_rf_lds)(__XTEAM_SHARED_LDS uint64_t *,
+                                       __XTEAM_SHARED_LDS uint64_t *),
+                       const uint64_t iv) {
+  _xteam_reduction<uint64_t, 32, 32>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds,
+                                     iv);
+}
+
+_EXT_ATTR
+__kmpc_xteamr_d_16x32(double v, double *r_ptr, double *tvals, uint32_t *td_ptr,
+                      void (*_rf)(double *, double),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS double *,
+                                      __XTEAM_SHARED_LDS double *),
+                      const double iv) {
+  _xteam_reduction<double, 16, 32>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_f_16x32(float v, float *r_ptr, float *tvals, uint32_t *td_ptr,
+                      void (*_rf)(float *, float),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS float *,
+                                      __XTEAM_SHARED_LDS float *),
+                      const float iv) {
+  _xteam_reduction<float, 16, 32>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_cd_16x32(double _Complex v, double _Complex *r_ptr,
+                       double _Complex *tvals, uint32_t *td_ptr,
+                       void (*_rf)(double _Complex *, double _Complex),
+                       void (*_rf_lds)(__XTEAM_SHARED_LDS double _Complex *,
+                                       __XTEAM_SHARED_LDS double _Complex *),
+                       const double _Complex iv) {
+  _xteam_reduction<double _Complex, 16, 32>(v, r_ptr, tvals, td_ptr, _rf,
+                                            _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_cf_16x32(float _Complex v, float _Complex *r_ptr,
+                       float _Complex *tvals, uint32_t *td_ptr,
+                       void (*_rf)(float _Complex *, float _Complex),
+                       void (*_rf_lds)(__XTEAM_SHARED_LDS float _Complex *,
+                                       __XTEAM_SHARED_LDS float _Complex *),
+                       const float _Complex iv) {
+  _xteam_reduction<float _Complex, 16, 32>(v, r_ptr, tvals, td_ptr, _rf,
+                                           _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_i_16x32(int v, int *r_ptr, int *tvals, uint32_t *td_ptr,
+                      void (*_rf)(int *, int),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS int *,
+                                      __XTEAM_SHARED_LDS int *),
+                      const int iv) {
+  _xteam_reduction<int, 16, 32>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_ui_16x32(uint32_t v, uint32_t *r_ptr, uint32_t *tvals,
+                       uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t),
+                       void (*_rf_lds)(__XTEAM_SHARED_LDS uint32_t *,
+                                       __XTEAM_SHARED_LDS uint32_t *),
+                       const uint32_t iv) {
+  _xteam_reduction<uint32_t, 16, 32>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds,
+                                     iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_l_16x32(long v, long *r_ptr, long *tvals, uint32_t *td_ptr,
+                      void (*_rf)(long *, long),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS long *,
+                                      __XTEAM_SHARED_LDS long *),
+                      const long iv) {
+  _xteam_reduction<long, 16, 32>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_ul_16x32(uint64_t v, uint64_t *r_ptr, uint64_t *tvals,
+                       uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t),
+                       void (*_rf_lds)(__XTEAM_SHARED_LDS uint64_t *,
+                                       __XTEAM_SHARED_LDS uint64_t *),
+                       const uint64_t iv) {
+  _xteam_reduction<uint64_t, 16, 32>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds,
+                                     iv);
+}
+
+_EXT_ATTR
+__kmpc_xteamr_d_8x32(double v, double *r_ptr, double *tvals, uint32_t *td_ptr,
+                     void (*_rf)(double *, double),
+                     void (*_rf_lds)(__XTEAM_SHARED_LDS double *,
+                                     __XTEAM_SHARED_LDS double *),
+                     const double iv) {
+  _xteam_reduction<double, 8, 32>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_f_8x32(float v, float *r_ptr, float *tvals, uint32_t *td_ptr,
+                     void (*_rf)(float *, float),
+                     void (*_rf_lds)(__XTEAM_SHARED_LDS float *,
+                                     __XTEAM_SHARED_LDS float *),
+                     const float iv) {
+  _xteam_reduction<float, 8, 32>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_cd_8x32(double _Complex v, double _Complex *r_ptr,
+                      double _Complex *tvals, uint32_t *td_ptr,
+                      void (*_rf)(double _Complex *, double _Complex),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS double _Complex *,
+                                      __XTEAM_SHARED_LDS double _Complex *),
+                      const double _Complex iv) {
+  _xteam_reduction<double _Complex, 8, 32>(v, r_ptr, tvals, td_ptr, _rf,
+                                           _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_cf_8x32(float _Complex v, float _Complex *r_ptr,
+                      float _Complex *tvals, uint32_t *td_ptr,
+                      void (*_rf)(float _Complex *, float _Complex),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS float _Complex *,
+                                      __XTEAM_SHARED_LDS float _Complex *),
+                      const float _Complex iv) {
+  _xteam_reduction<float _Complex, 8, 32>(v, r_ptr, tvals, td_ptr, _rf,
+                                          _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_i_8x32(int v, int *r_ptr, int *tvals, uint32_t *td_ptr,
+                     void (*_rf)(int *, int),
+                     void (*_rf_lds)(__XTEAM_SHARED_LDS int *,
+                                     __XTEAM_SHARED_LDS int *),
+                     const int iv) {
+  _xteam_reduction<int, 8, 32>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_ui_8x32(uint32_t v, uint32_t *r_ptr, uint32_t *tvals,
+                      uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS uint32_t *,
+                                      __XTEAM_SHARED_LDS uint32_t *),
+                      const uint32_t iv) {
+  _xteam_reduction<uint32_t, 8, 32>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_l_8x32(long v, long *r_ptr, long *tvals, uint32_t *td_ptr,
+                     void (*_rf)(long *, long),
+                     void (*_rf_lds)(__XTEAM_SHARED_LDS long *,
+                                     __XTEAM_SHARED_LDS long *),
+                     const long iv) {
+  _xteam_reduction<long, 8, 32>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+_EXT_ATTR
+__kmpc_xteamr_ul_8x32(uint64_t v, uint64_t *r_ptr, uint64_t *tvals,
+                      uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t),
+                      void (*_rf_lds)(__XTEAM_SHARED_LDS uint64_t *,
+                                      __XTEAM_SHARED_LDS uint64_t *),
+                      const uint64_t iv) {
+  _xteam_reduction<uint64_t, 8, 32>(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv);
+}
+
+// Built-in pair reduction functions used as function pointers for
+// cross team reduction functions.
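+// Each pair function folds its second argument into the value addressed by
+// its first argument (e.g. *val += otherval); the _lds variants do the same
+// for two values held in team-shared (LDS) memory. For example, a double sum
+// reduction passes __kmpc_rfun_sum_d as _rf and __kmpc_rfun_sum_lds_d as
+// _rf_lds, with 0.0 as the reduction null value.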
+ +#define _RF_LDS volatile __attribute__((address_space(3))) + +_EXT_ATTR __kmpc_rfun_sum_d(double *val, double otherval) { *val += otherval; } +_EXT_ATTR __kmpc_rfun_sum_lds_d(_RF_LDS double *val, _RF_LDS double *otherval) { + *val += *otherval; +} +_EXT_ATTR __kmpc_rfun_sum_f(float *val, float otherval) { *val += otherval; } +_EXT_ATTR __kmpc_rfun_sum_lds_f(_RF_LDS float *val, _RF_LDS float *otherval) { + *val += *otherval; +} +_EXT_ATTR __kmpc_rfun_sum_cd(double _Complex *val, double _Complex otherval) { + *val += otherval; +} +_EXT_ATTR __kmpc_rfun_sum_lds_cd(_RF_LDS double _Complex *val, + _RF_LDS double _Complex *otherval) { + *val += *otherval; +} +_EXT_ATTR __kmpc_rfun_sum_cf(float _Complex *val, float _Complex otherval) { + *val += otherval; +} +_EXT_ATTR __kmpc_rfun_sum_lds_cf(_RF_LDS float _Complex *val, + _RF_LDS float _Complex *otherval) { + *val += *otherval; +} +_EXT_ATTR __kmpc_rfun_sum_i(int *val, int otherval) { *val += otherval; } +_EXT_ATTR __kmpc_rfun_sum_lds_i(_RF_LDS int *val, _RF_LDS int *otherval) { + *val += *otherval; +} +_EXT_ATTR __kmpc_rfun_sum_ui(unsigned int *val, unsigned int otherval) { + *val += otherval; +} +_EXT_ATTR __kmpc_rfun_sum_lds_ui(_RF_LDS unsigned int *val, + _RF_LDS unsigned int *otherval) { + *val += *otherval; +} +_EXT_ATTR __kmpc_rfun_sum_l(long *val, long otherval) { *val += otherval; } +_EXT_ATTR __kmpc_rfun_sum_lds_l(_RF_LDS long *val, _RF_LDS long *otherval) { + *val += *otherval; +} +_EXT_ATTR __kmpc_rfun_sum_ul(unsigned long *val, unsigned long otherval) { + *val += otherval; +} +_EXT_ATTR __kmpc_rfun_sum_lds_ul(_RF_LDS unsigned long *val, + _RF_LDS unsigned long *otherval) { + *val += *otherval; +} + +_EXT_ATTR __kmpc_rfun_min_d(double *val, double otherval) { + *val = (otherval < *val) ? otherval : *val; +} +_EXT_ATTR __kmpc_rfun_min_lds_d(_RF_LDS double *val, _RF_LDS double *otherval) { + *val = (*otherval < *val) ? *otherval : *val; +} +_EXT_ATTR __kmpc_rfun_min_f(float *val, float otherval) { + *val = (otherval < *val) ? otherval : *val; +} +_EXT_ATTR __kmpc_rfun_min_lds_f(_RF_LDS float *val, _RF_LDS float *otherval) { + *val = (*otherval < *val) ? *otherval : *val; +} +_EXT_ATTR __kmpc_rfun_min_i(int *val, int otherval) { + *val = (otherval < *val) ? otherval : *val; +} +_EXT_ATTR __kmpc_rfun_min_lds_i(_RF_LDS int *val, _RF_LDS int *otherval) { + *val = (*otherval < *val) ? *otherval : *val; +} +_EXT_ATTR __kmpc_rfun_min_ui(unsigned int *val, unsigned int otherval) { + *val = (otherval < *val) ? otherval : *val; +} +_EXT_ATTR __kmpc_rfun_min_lds_ui(_RF_LDS unsigned int *val, + _RF_LDS unsigned int *otherval) { + *val = (*otherval < *val) ? *otherval : *val; +} +_EXT_ATTR __kmpc_rfun_min_l(long *val, long otherval) { + *val = (otherval < *val) ? otherval : *val; +} +_EXT_ATTR __kmpc_rfun_min_lds_l(_RF_LDS long *val, _RF_LDS long *otherval) { + *val = (*otherval < *val) ? *otherval : *val; +} +_EXT_ATTR __kmpc_rfun_min_ul(unsigned long *val, unsigned long otherval) { + *val = (otherval < *val) ? otherval : *val; +} +_EXT_ATTR __kmpc_rfun_min_lds_ul(_RF_LDS unsigned long *val, + _RF_LDS unsigned long *otherval) { + *val = (*otherval < *val) ? *otherval : *val; +} + +_EXT_ATTR __kmpc_rfun_max_d(double *val, double otherval) { + *val = (otherval > *val) ? otherval : *val; +} +_EXT_ATTR __kmpc_rfun_max_lds_d(_RF_LDS double *val, _RF_LDS double *otherval) { + *val = (*otherval > *val) ? *otherval : *val; +} +_EXT_ATTR __kmpc_rfun_max_f(float *val, float otherval) { + *val = (otherval > *val) ? 
otherval : *val;
+}
+_EXT_ATTR __kmpc_rfun_max_lds_f(_RF_LDS float *val, _RF_LDS float *otherval) {
+  *val = (*otherval > *val) ? *otherval : *val;
+}
+_EXT_ATTR __kmpc_rfun_max_i(int *val, int otherval) {
+  *val = (otherval > *val) ? otherval : *val;
+}
+_EXT_ATTR __kmpc_rfun_max_lds_i(_RF_LDS int *val, _RF_LDS int *otherval) {
+  *val = (*otherval > *val) ? *otherval : *val;
+}
+_EXT_ATTR __kmpc_rfun_max_ui(unsigned int *val, unsigned int otherval) {
+  *val = (otherval > *val) ? otherval : *val;
+}
+_EXT_ATTR __kmpc_rfun_max_lds_ui(_RF_LDS unsigned int *val,
+                                 _RF_LDS unsigned int *otherval) {
+  *val = (*otherval > *val) ? *otherval : *val;
+}
+_EXT_ATTR __kmpc_rfun_max_l(long *val, long otherval) {
+  *val = (otherval > *val) ? otherval : *val;
+}
+_EXT_ATTR __kmpc_rfun_max_lds_l(_RF_LDS long *val, _RF_LDS long *otherval) {
+  *val = (*otherval > *val) ? *otherval : *val;
+}
+_EXT_ATTR __kmpc_rfun_max_ul(unsigned long *val, unsigned long otherval) {
+  *val = (otherval > *val) ? otherval : *val;
+}
+_EXT_ATTR __kmpc_rfun_max_lds_ul(_RF_LDS unsigned long *val,
+                                 _RF_LDS unsigned long *otherval) {
+  *val = (*otherval > *val) ? *otherval : *val;
+}
+
+#undef _EXT_ATTR
+#undef _RF_LDS
+
+#pragma omp end declare target
diff --git a/openmp/libomptarget/test/lit.cfg b/openmp/libomptarget/test/lit.cfg
--- a/openmp/libomptarget/test/lit.cfg
+++ b/openmp/libomptarget/test/lit.cfg
@@ -99,6 +99,7 @@
     config.test_flags += " -Wl,-rpath," + config.library_dir
     config.test_flags += " -Wl,-rpath," + config.omp_host_rtl_directory
    config.test_flags += " -Wl,-rpath," + config.llvm_lib_directory
+    config.test_flags += " -latomic"
     if config.cuda_libdir:
         config.test_flags += " -Wl,-rpath," + config.cuda_libdir
 if config.libomptarget_current_target.startswith('amdgcn'):
diff --git a/openmp/libomptarget/test/xteamr/test_xteamr.h b/openmp/libomptarget/test/xteamr/test_xteamr.h
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/test/xteamr/test_xteamr.h
@@ -0,0 +1,892 @@
+
+// Header file: overload_to_externs.h
+// generated by utility gen_externs
+
+#define _CD double _Complex
+#define _CF float _Complex
+#define _UI unsigned int
+#define _UL unsigned long
+#define _INLINE_ATTR_ __attribute__((flatten, always_inline))
+
+// Headers for the extern xteamr functions defined in the libomptarget
+// DeviceRTL are declared here in the test application because user apps
+// cannot include the DeviceRTL Interface.h header file.
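+//
+// Illustrative sketch only (not part of the generated header): a test can
+// drive the 16x64 double sum entry point from a target region roughly as
+// follows, where `result`, the per-team array `team_vals`, and the
+// zero-initialized counter `teams_done` are device-resident names assumed
+// by this sketch:
+//
+//   double val = 0.0; // thread-local partial sum
+//   for (int64_t i = tid; i < N; i += nthreads)
+//     val += a[i];
+//   __kmpc_xteamr_d_16x64(val, &result, team_vals, &teams_done,
+//                         __kmpc_rfun_sum_d, __kmpc_rfun_sum_lds_d, 0.0);
+//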
+ +#if defined(__AMDGCN__) || defined(__NVPTX__) +extern "C" { +#define _RF_LDS volatile __attribute__((address_space(3))) +void _INLINE_ATTR_ __kmpc_xteamr_d_16x64 + (double v, double *r_ptr, double *tvs, uint32_t *td, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double iv); +void _INLINE_ATTR_ __kmpc_xteamr_f_16x64 + (float v, float *r_ptr, float *tvs, uint32_t *td, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float iv); +void _INLINE_ATTR_ __kmpc_xteamr_cd_16x64 + (_CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), + void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD iv); +void _INLINE_ATTR_ __kmpc_xteamr_cf_16x64 + (_CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), + void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF iv); +void _INLINE_ATTR_ __kmpc_xteamr_i_16x64 + (int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int iv); +void _INLINE_ATTR_ __kmpc_xteamr_ui_16x64 + (_UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), + void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI iv); +void _INLINE_ATTR_ __kmpc_xteamr_l_16x64 + (long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long iv); +void _INLINE_ATTR_ __kmpc_xteamr_ul_16x64 + (_UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), + void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL iv); +void _INLINE_ATTR_ __kmpc_xteamr_d_32x32 + (double v, double *r_ptr, double *tvs, uint32_t *td, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double iv); +void _INLINE_ATTR_ __kmpc_xteamr_f_32x32 + (float v, float *r_ptr, float *tvs, uint32_t *td, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float iv); +void _INLINE_ATTR_ __kmpc_xteamr_cd_32x32 + (_CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), + void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD iv); +void _INLINE_ATTR_ __kmpc_xteamr_cf_32x32 + (_CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), + void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF iv); +void _INLINE_ATTR_ __kmpc_xteamr_i_32x32 + (int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int iv); +void _INLINE_ATTR_ __kmpc_xteamr_ui_32x32 + (_UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), + void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI iv); +void _INLINE_ATTR_ __kmpc_xteamr_l_32x32 + (long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long iv); +void _INLINE_ATTR_ __kmpc_xteamr_ul_32x32 + (_UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), + void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL iv); +void _INLINE_ATTR_ __kmpc_xteamr_d_8x64 + (double v, double *r_ptr, double *tvs, uint32_t *td, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double iv); +void _INLINE_ATTR_ __kmpc_xteamr_f_8x64 + (float v, float *r_ptr, float *tvs, uint32_t *td, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float iv); +void _INLINE_ATTR_ __kmpc_xteamr_cd_8x64 + (_CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD 
*, _CD), + void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD iv); +void _INLINE_ATTR_ __kmpc_xteamr_cf_8x64 + (_CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), + void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF iv); +void _INLINE_ATTR_ __kmpc_xteamr_i_8x64 + (int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int iv); +void _INLINE_ATTR_ __kmpc_xteamr_ui_8x64 + (_UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), + void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI iv); +void _INLINE_ATTR_ __kmpc_xteamr_l_8x64 + (long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long iv); +void _INLINE_ATTR_ __kmpc_xteamr_ul_8x64 + (_UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), + void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL iv); +void _INLINE_ATTR_ __kmpc_xteamr_d_16x32 + (double v, double *r_ptr, double *tvs, uint32_t *td, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double iv); +void _INLINE_ATTR_ __kmpc_xteamr_f_16x32 + (float v, float *r_ptr, float *tvs, uint32_t *td, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float iv); +void _INLINE_ATTR_ __kmpc_xteamr_cd_16x32 + (_CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), + void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD iv); +void _INLINE_ATTR_ __kmpc_xteamr_cf_16x32 + (_CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), + void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF iv); +void _INLINE_ATTR_ __kmpc_xteamr_i_16x32 + (int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int iv); +void _INLINE_ATTR_ __kmpc_xteamr_ui_16x32 + (_UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), + void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI iv); +void _INLINE_ATTR_ __kmpc_xteamr_l_16x32 + (long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long iv); +void _INLINE_ATTR_ __kmpc_xteamr_ul_16x32 + (_UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), + void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL iv); +void _INLINE_ATTR_ __kmpc_xteamr_d_4x64 + (double v, double *r_ptr, double *tvs, uint32_t *td, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double iv); +void _INLINE_ATTR_ __kmpc_xteamr_f_4x64 + (float v, float *r_ptr, float *tvs, uint32_t *td, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float iv); +void _INLINE_ATTR_ __kmpc_xteamr_cd_4x64 + (_CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), + void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD iv); +void _INLINE_ATTR_ __kmpc_xteamr_cf_4x64 + (_CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), + void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF iv); +void _INLINE_ATTR_ __kmpc_xteamr_i_4x64 + (int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int iv); +void _INLINE_ATTR_ __kmpc_xteamr_ui_4x64 + (_UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), + void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI iv); +void _INLINE_ATTR_ 
__kmpc_xteamr_l_4x64 + (long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long iv); +void _INLINE_ATTR_ __kmpc_xteamr_ul_4x64 + (_UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), + void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL iv); +void _INLINE_ATTR_ __kmpc_xteamr_d_8x32 + (double v, double *r_ptr, double *tvs, uint32_t *td, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double iv); +void _INLINE_ATTR_ __kmpc_xteamr_f_8x32 + (float v, float *r_ptr, float *tvs, uint32_t *td, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float iv); +void _INLINE_ATTR_ __kmpc_xteamr_cd_8x32 + (_CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), + void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD iv); +void _INLINE_ATTR_ __kmpc_xteamr_cf_8x32 + (_CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), + void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF iv); +void _INLINE_ATTR_ __kmpc_xteamr_i_8x32 + (int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int iv); +void _INLINE_ATTR_ __kmpc_xteamr_ui_8x32 + (_UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), + void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI iv); +void _INLINE_ATTR_ __kmpc_xteamr_l_8x32 + (long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long iv); +void _INLINE_ATTR_ __kmpc_xteamr_ul_8x32 + (_UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), + void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL iv); +void __kmpc_rfun_sum_d(double *val, double otherval); +void __kmpc_rfun_sum_lds_d(_RF_LDS double *val, _RF_LDS double *otherval); +void __kmpc_rfun_sum_f(float *val, float otherval); +void __kmpc_rfun_sum_lds_f(_RF_LDS float *val, _RF_LDS float *otherval); +void __kmpc_rfun_sum_cd(_CD *val, _CD otherval); +void __kmpc_rfun_sum_lds_cd(_RF_LDS _CD *val, _RF_LDS _CD *otherval); +void __kmpc_rfun_sum_cf(_CF *val, _CF otherval); +void __kmpc_rfun_sum_lds_cf(_RF_LDS _CF *val, _RF_LDS _CF *otherval); +void __kmpc_rfun_sum_i(int *val, int otherval); +void __kmpc_rfun_sum_lds_i(_RF_LDS int *val, _RF_LDS int *otherval); +void __kmpc_rfun_sum_ui(_UI *val, _UI otherval); +void __kmpc_rfun_sum_lds_ui(_RF_LDS _UI *val, _RF_LDS _UI *otherval); +void __kmpc_rfun_sum_l(long *val, long otherval); +void __kmpc_rfun_sum_lds_l(_RF_LDS long *val, _RF_LDS long *otherval); +void __kmpc_rfun_sum_ul(_UL *val, _UL otherval); +void __kmpc_rfun_sum_lds_ul(_RF_LDS _UL *val, _RF_LDS _UL *otherval); +void __kmpc_rfun_max_d(double *val, double otherval); +void __kmpc_rfun_max_lds_d(_RF_LDS double *val, _RF_LDS double *otherval); +void __kmpc_rfun_max_f(float *val, float otherval); +void __kmpc_rfun_max_lds_f(_RF_LDS float *val, _RF_LDS float *otherval); +void __kmpc_rfun_max_i(int *val, int otherval); +void __kmpc_rfun_max_lds_i(_RF_LDS int *val, _RF_LDS int *otherval); +void __kmpc_rfun_max_ui(_UI *val, _UI otherval); +void __kmpc_rfun_max_lds_ui(_RF_LDS _UI *val, _RF_LDS _UI *otherval); +void __kmpc_rfun_max_l(long *val, long otherval); +void __kmpc_rfun_max_lds_l(_RF_LDS long *val, _RF_LDS long *otherval); +void __kmpc_rfun_max_ul(_UL *val, _UL otherval); +void __kmpc_rfun_max_lds_ul(_RF_LDS _UL *val, _RF_LDS _UL *otherval); +void 
__kmpc_rfun_min_d(double *val, double otherval); +void __kmpc_rfun_min_lds_d(_RF_LDS double *val, _RF_LDS double *otherval); +void __kmpc_rfun_min_f(float *val, float otherval); +void __kmpc_rfun_min_lds_f(_RF_LDS float *val, _RF_LDS float *otherval); +void __kmpc_rfun_min_i(int *val, int otherval); +void __kmpc_rfun_min_lds_i(_RF_LDS int *val, _RF_LDS int *otherval); +void __kmpc_rfun_min_ui(_UI *val, _UI otherval); +void __kmpc_rfun_min_lds_ui(_RF_LDS _UI *val, _RF_LDS _UI *otherval); +void __kmpc_rfun_min_l(long *val, long otherval); +void __kmpc_rfun_min_lds_l(_RF_LDS long *val, _RF_LDS long *otherval); +void __kmpc_rfun_min_ul(_UL *val, _UL otherval); +void __kmpc_rfun_min_lds_ul(_RF_LDS _UL *val, _RF_LDS _UL *otherval); +#undef _RF_LDS +int __kmpc_get_warp_size(); +} // end extern C + +#else + +// For host compilation, define null functions for host linking. + +extern "C" { +#undef _RF_LDS +#define _RF_LDS +void __kmpc_xteamr_d_16x64 + (double v, double *r_ptr, double *tvs, uint32_t *td, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double iv){} +void __kmpc_xteamr_f_16x64 + (float v, float *r_ptr, float *tvs, uint32_t *td, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float iv){} +void __kmpc_xteamr_cd_16x64 + (_CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), + void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD iv){} +void __kmpc_xteamr_cf_16x64 + (_CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), + void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF iv){} +void __kmpc_xteamr_i_16x64 + (int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int iv){} +void __kmpc_xteamr_ui_16x64 + (_UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), + void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI iv){} +void __kmpc_xteamr_l_16x64 + (long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long iv){} +void __kmpc_xteamr_ul_16x64 + (_UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), + void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL iv){} +void __kmpc_xteamr_d_32x32 + (double v, double *r_ptr, double *tvs, uint32_t *td, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double iv){} +void __kmpc_xteamr_f_32x32 + (float v, float *r_ptr, float *tvs, uint32_t *td, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float iv){} +void __kmpc_xteamr_cd_32x32 + (_CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), + void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD iv){} +void __kmpc_xteamr_cf_32x32 + (_CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), + void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF iv){} +void __kmpc_xteamr_i_32x32 + (int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int iv){} +void __kmpc_xteamr_ui_32x32 + (_UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), + void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI iv){} +void __kmpc_xteamr_l_32x32 + (long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long iv){} +void __kmpc_xteamr_ul_32x32 + (_UL v, _UL *r_ptr, _UL *tvs, 
uint32_t *td, void (*_rf)(_UL *, _UL), + void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL iv){} +void __kmpc_xteamr_d_8x64 + (double v, double *r_ptr, double *tvs, uint32_t *td, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double iv){} +void __kmpc_xteamr_f_8x64 + (float v, float *r_ptr, float *tvs, uint32_t *td, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float iv){} +void __kmpc_xteamr_cd_8x64 + (_CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), + void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD iv){} +void __kmpc_xteamr_cf_8x64 + (_CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), + void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF iv){} +void __kmpc_xteamr_i_8x64 + (int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int iv){} +void __kmpc_xteamr_ui_8x64 + (_UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), + void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI iv){} +void __kmpc_xteamr_l_8x64 + (long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long iv){} +void __kmpc_xteamr_ul_8x64 + (_UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), + void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL iv){} +void __kmpc_xteamr_d_16x32 + (double v, double *r_ptr, double *tvs, uint32_t *td, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double iv){} +void __kmpc_xteamr_f_16x32 + (float v, float *r_ptr, float *tvs, uint32_t *td, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float iv){} +void __kmpc_xteamr_cd_16x32 + (_CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), + void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD iv){} +void __kmpc_xteamr_cf_16x32 + (_CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), + void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF iv){} +void __kmpc_xteamr_i_16x32 + (int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int iv){} +void __kmpc_xteamr_ui_16x32 + (_UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), + void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI iv){} +void __kmpc_xteamr_l_16x32 + (long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long iv){} +void __kmpc_xteamr_ul_16x32 + (_UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), + void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL iv){} +void __kmpc_xteamr_d_4x64 + (double v, double *r_ptr, double *tvs, uint32_t *td, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double iv){} +void __kmpc_xteamr_f_4x64 + (float v, float *r_ptr, float *tvs, uint32_t *td, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float iv){} +void __kmpc_xteamr_cd_4x64 + (_CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), + void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD iv){} +void __kmpc_xteamr_cf_4x64 + (_CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), + void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF iv){} +void __kmpc_xteamr_i_4x64 + (int v, 
int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int iv){} +void __kmpc_xteamr_ui_4x64 + (_UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), + void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI iv){} +void __kmpc_xteamr_l_4x64 + (long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long iv){} +void __kmpc_xteamr_ul_4x64 + (_UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), + void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL iv){} +void __kmpc_xteamr_d_8x32 + (double v, double *r_ptr, double *tvs, uint32_t *td, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double iv){} +void __kmpc_xteamr_f_8x32 + (float v, float *r_ptr, float *tvs, uint32_t *td, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float iv){} +void __kmpc_xteamr_cd_8x32 + (_CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), + void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD iv){} +void __kmpc_xteamr_cf_8x32 + (_CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), + void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF iv){} +void __kmpc_xteamr_i_8x32 + (int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int iv){} +void __kmpc_xteamr_ui_8x32 + (_UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), + void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI iv){} +void __kmpc_xteamr_l_8x32 + (long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long iv){} +void __kmpc_xteamr_ul_8x32 + (_UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), + void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL iv){} +void __kmpc_rfun_sum_d(double *val, double otherval){} +void __kmpc_rfun_sum_lds_d(_RF_LDS double *val, _RF_LDS double *otherval){} +void __kmpc_rfun_sum_f(float *val, float otherval){} +void __kmpc_rfun_sum_lds_f(_RF_LDS float *val, _RF_LDS float *otherval){} +void __kmpc_rfun_sum_cd(_CD *val, _CD otherval){} +void __kmpc_rfun_sum_lds_cd(_RF_LDS _CD *val, _RF_LDS _CD *otherval){} +void __kmpc_rfun_sum_cf(_CF *val, _CF otherval){} +void __kmpc_rfun_sum_lds_cf(_RF_LDS _CF *val, _RF_LDS _CF *otherval){} +void __kmpc_rfun_sum_i(int *val, int otherval){} +void __kmpc_rfun_sum_lds_i(_RF_LDS int *val, _RF_LDS int *otherval){} +void __kmpc_rfun_sum_ui(_UI *val, _UI otherval){} +void __kmpc_rfun_sum_lds_ui(_RF_LDS _UI *val, _RF_LDS _UI *otherval){} +void __kmpc_rfun_sum_l(long *val, long otherval){} +void __kmpc_rfun_sum_lds_l(_RF_LDS long *val, _RF_LDS long *otherval){} +void __kmpc_rfun_sum_ul(_UL *val, _UL otherval){} +void __kmpc_rfun_sum_lds_ul(_RF_LDS _UL *val, _RF_LDS _UL *otherval){} +void __kmpc_rfun_max_d(double *val, double otherval){} +void __kmpc_rfun_max_lds_d(_RF_LDS double *val, _RF_LDS double *otherval){} +void __kmpc_rfun_max_f(float *val, float otherval){} +void __kmpc_rfun_max_lds_f(_RF_LDS float *val, _RF_LDS float *otherval){} +void __kmpc_rfun_max_i(int *val, int otherval){} +void __kmpc_rfun_max_lds_i(_RF_LDS int *val, _RF_LDS int *otherval){} +void __kmpc_rfun_max_ui(_UI *val, _UI otherval){} +void __kmpc_rfun_max_lds_ui(_RF_LDS _UI *val, _RF_LDS _UI *otherval){} +void __kmpc_rfun_max_l(long *val, long otherval){} +void 
__kmpc_rfun_max_lds_l(_RF_LDS long *val, _RF_LDS long *otherval){}
+void __kmpc_rfun_max_ul(_UL *val, _UL otherval){}
+void __kmpc_rfun_max_lds_ul(_RF_LDS _UL *val, _RF_LDS _UL *otherval){}
+void __kmpc_rfun_min_d(double *val, double otherval){}
+void __kmpc_rfun_min_lds_d(_RF_LDS double *val, _RF_LDS double *otherval){}
+void __kmpc_rfun_min_f(float *val, float otherval){}
+void __kmpc_rfun_min_lds_f(_RF_LDS float *val, _RF_LDS float *otherval){}
+void __kmpc_rfun_min_i(int *val, int otherval){}
+void __kmpc_rfun_min_lds_i(_RF_LDS int *val, _RF_LDS int *otherval){}
+void __kmpc_rfun_min_ui(_UI *val, _UI otherval){}
+void __kmpc_rfun_min_lds_ui(_RF_LDS _UI *val, _RF_LDS _UI *otherval){}
+void __kmpc_rfun_min_l(long *val, long otherval){}
+void __kmpc_rfun_min_lds_l(_RF_LDS long *val, _RF_LDS long *otherval){}
+void __kmpc_rfun_min_ul(_UL *val, _UL otherval){}
+void __kmpc_rfun_min_lds_ul(_RF_LDS _UL *val, _RF_LDS _UL *otherval){}
+#undef _RF_LDS
+int __kmpc_get_warp_size(){
+  printf("ERROR: executing __kmpc_get_warp_size on host\n");
+  return -1;}
+} // end extern C
+
+#endif // of definitions for host null functions
+
+// These overloaded function definitions allow this test framework
+// (test_xteamr.cpp) to invoke the extern DeviceRTL helper functions.
+
+void _INLINE_ATTR_ _overload_to_extern_sum_16x64
+ (double val, double *rv, double *tvs, uint32_t *td, const double iv)
+ { __kmpc_xteamr_d_16x64(val, rv, tvs, td,
+   __kmpc_rfun_sum_d, __kmpc_rfun_sum_lds_d, iv);}
+void _INLINE_ATTR_ _overload_to_extern_sum_16x64
+ (float val, float *rv, float *tvs, uint32_t *td, const float iv)
+ { __kmpc_xteamr_f_16x64(val, rv, tvs, td,
+   __kmpc_rfun_sum_f, __kmpc_rfun_sum_lds_f, iv);}
+void _INLINE_ATTR_ _overload_to_extern_sum_16x64
+ (_CD val, _CD *rv, _CD *tvs, uint32_t *td, const _CD iv)
+ { __kmpc_xteamr_cd_16x64(val, rv, tvs, td,
+   __kmpc_rfun_sum_cd, __kmpc_rfun_sum_lds_cd, iv);}
+void _INLINE_ATTR_ _overload_to_extern_sum_16x64
+ (_CF val, _CF *rv, _CF *tvs, uint32_t *td, const _CF iv)
+ { __kmpc_xteamr_cf_16x64(val, rv, tvs, td,
+   __kmpc_rfun_sum_cf, __kmpc_rfun_sum_lds_cf, iv);}
+void _INLINE_ATTR_ _overload_to_extern_sum_16x64
+ (int val, int *rv, int *tvs, uint32_t *td, const int iv)
+ { __kmpc_xteamr_i_16x64(val, rv, tvs, td,
+   __kmpc_rfun_sum_i, __kmpc_rfun_sum_lds_i, iv);}
+void _INLINE_ATTR_ _overload_to_extern_sum_16x64
+ (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv)
+ { __kmpc_xteamr_ui_16x64(val, rv, tvs, td,
+   __kmpc_rfun_sum_ui, __kmpc_rfun_sum_lds_ui, iv);}
+void _INLINE_ATTR_ _overload_to_extern_sum_16x64
+ (long val, long *rv, long *tvs, uint32_t *td, const long iv)
+ { __kmpc_xteamr_l_16x64(val, rv, tvs, td,
+   __kmpc_rfun_sum_l, __kmpc_rfun_sum_lds_l, iv);}
+void _INLINE_ATTR_ _overload_to_extern_sum_16x64
+ (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv)
+ { __kmpc_xteamr_ul_16x64(val, rv, tvs, td,
+   __kmpc_rfun_sum_ul, __kmpc_rfun_sum_lds_ul, iv);}
+void _INLINE_ATTR_ _overload_to_extern_sum_32x32
+ (double val, double *rv, double *tvs, uint32_t *td, const double iv)
+ { __kmpc_xteamr_d_32x32(val, rv, tvs, td,
+   __kmpc_rfun_sum_d, __kmpc_rfun_sum_lds_d, iv);}
+void _INLINE_ATTR_ _overload_to_extern_sum_32x32
+ (float val, float *rv, float *tvs, uint32_t *td, const float iv)
+ { __kmpc_xteamr_f_32x32(val, rv, tvs, td,
+   __kmpc_rfun_sum_f, __kmpc_rfun_sum_lds_f, iv);}
+void _INLINE_ATTR_ _overload_to_extern_sum_32x32
+ (_CD val, _CD *rv, _CD *tvs, uint32_t *td, const _CD iv)
+ { __kmpc_xteamr_cd_32x32(val, rv, tvs, td,
+   __kmpc_rfun_sum_cd,
__kmpc_rfun_sum_lds_cd, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_32x32 + (_CF val, _CF *rv, _CF *tvs, uint32_t *td, const _CF iv) + { __kmpc_xteamr_cf_32x32(val, rv, tvs, td, + __kmpc_rfun_sum_cf, __kmpc_rfun_sum_lds_cf, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_32x32 + (int val, int *rv, int *tvs, uint32_t *td, const int iv) + { __kmpc_xteamr_i_32x32(val, rv, tvs, td, + __kmpc_rfun_sum_i, __kmpc_rfun_sum_lds_i, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_32x32 + (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv) + { __kmpc_xteamr_ui_32x32(val, rv, tvs, td, + __kmpc_rfun_sum_ui, __kmpc_rfun_sum_lds_ui, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_32x32 + (long val, long *rv, long *tvs, uint32_t *td, const long iv) + { __kmpc_xteamr_l_32x32(val, rv, tvs, td, + __kmpc_rfun_sum_l, __kmpc_rfun_sum_lds_l, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_32x32 + (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv) + { __kmpc_xteamr_ul_32x32(val, rv, tvs, td, + __kmpc_rfun_sum_ul, __kmpc_rfun_sum_lds_ul, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_8x64 + (double val, double *rv, double *tvs, uint32_t *td, const double iv) + { __kmpc_xteamr_d_8x64(val, rv, tvs, td, + __kmpc_rfun_sum_d, __kmpc_rfun_sum_lds_d, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_8x64 + (float val, float *rv, float *tvs, uint32_t *td, const float iv) + { __kmpc_xteamr_f_8x64(val, rv, tvs, td, + __kmpc_rfun_sum_f, __kmpc_rfun_sum_lds_f, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_8x64 + (_CD val, _CD *rv, _CD *tvs, uint32_t *td, const _CD iv) + { __kmpc_xteamr_cd_8x64(val, rv, tvs, td, + __kmpc_rfun_sum_cd, __kmpc_rfun_sum_lds_cd, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_8x64 + (_CF val, _CF *rv, _CF *tvs, uint32_t *td, const _CF iv) + { __kmpc_xteamr_cf_8x64(val, rv, tvs, td, + __kmpc_rfun_sum_cf, __kmpc_rfun_sum_lds_cf, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_8x64 + (int val, int *rv, int *tvs, uint32_t *td, const int iv) + { __kmpc_xteamr_i_8x64(val, rv, tvs, td, + __kmpc_rfun_sum_i, __kmpc_rfun_sum_lds_i, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_8x64 + (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv) + { __kmpc_xteamr_ui_8x64(val, rv, tvs, td, + __kmpc_rfun_sum_ui, __kmpc_rfun_sum_lds_ui, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_8x64 + (long val, long *rv, long *tvs, uint32_t *td, const long iv) + { __kmpc_xteamr_l_8x64(val, rv, tvs, td, + __kmpc_rfun_sum_l, __kmpc_rfun_sum_lds_l, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_8x64 + (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv) + { __kmpc_xteamr_ul_8x64(val, rv, tvs, td, + __kmpc_rfun_sum_ul, __kmpc_rfun_sum_lds_ul, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_16x32 + (double val, double *rv, double *tvs, uint32_t *td, const double iv) + { __kmpc_xteamr_d_16x32(val, rv, tvs, td, + __kmpc_rfun_sum_d, __kmpc_rfun_sum_lds_d, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_16x32 + (float val, float *rv, float *tvs, uint32_t *td, const float iv) + { __kmpc_xteamr_f_16x32(val, rv, tvs, td, + __kmpc_rfun_sum_f, __kmpc_rfun_sum_lds_f, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_16x32 + (_CD val, _CD *rv, _CD *tvs, uint32_t *td, const _CD iv) + { __kmpc_xteamr_cd_16x32(val, rv, tvs, td, + __kmpc_rfun_sum_cd, __kmpc_rfun_sum_lds_cd, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_16x32 + (_CF val, _CF *rv, _CF *tvs, uint32_t *td, const _CF iv) + { __kmpc_xteamr_cf_16x32(val, rv, tvs, td, + __kmpc_rfun_sum_cf, __kmpc_rfun_sum_lds_cf, iv);} +void 
_INLINE_ATTR_ _overload_to_extern_sum_16x32 + (int val, int *rv, int *tvs, uint32_t *td, const int iv) + { __kmpc_xteamr_i_16x32(val, rv, tvs, td, + __kmpc_rfun_sum_i, __kmpc_rfun_sum_lds_i, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_16x32 + (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv) + { __kmpc_xteamr_ui_16x32(val, rv, tvs, td, + __kmpc_rfun_sum_ui, __kmpc_rfun_sum_lds_ui, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_16x32 + (long val, long *rv, long *tvs, uint32_t *td, const long iv) + { __kmpc_xteamr_l_16x32(val, rv, tvs, td, + __kmpc_rfun_sum_l, __kmpc_rfun_sum_lds_l, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_16x32 + (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv) + { __kmpc_xteamr_ul_16x32(val, rv, tvs, td, + __kmpc_rfun_sum_ul, __kmpc_rfun_sum_lds_ul, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_4x64 + (double val, double *rv, double *tvs, uint32_t *td, const double iv) + { __kmpc_xteamr_d_4x64(val, rv, tvs, td, + __kmpc_rfun_sum_d, __kmpc_rfun_sum_lds_d, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_4x64 + (float val, float *rv, float *tvs, uint32_t *td, const float iv) + { __kmpc_xteamr_f_4x64(val, rv, tvs, td, + __kmpc_rfun_sum_f, __kmpc_rfun_sum_lds_f, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_4x64 + (_CD val, _CD *rv, _CD *tvs, uint32_t *td, const _CD iv) + { __kmpc_xteamr_cd_4x64(val, rv, tvs, td, + __kmpc_rfun_sum_cd, __kmpc_rfun_sum_lds_cd, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_4x64 + (_CF val, _CF *rv, _CF *tvs, uint32_t *td, const _CF iv) + { __kmpc_xteamr_cf_4x64(val, rv, tvs, td, + __kmpc_rfun_sum_cf, __kmpc_rfun_sum_lds_cf, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_4x64 + (int val, int *rv, int *tvs, uint32_t *td, const int iv) + { __kmpc_xteamr_i_4x64(val, rv, tvs, td, + __kmpc_rfun_sum_i, __kmpc_rfun_sum_lds_i, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_4x64 + (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv) + { __kmpc_xteamr_ui_4x64(val, rv, tvs, td, + __kmpc_rfun_sum_ui, __kmpc_rfun_sum_lds_ui, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_4x64 + (long val, long *rv, long *tvs, uint32_t *td, const long iv) + { __kmpc_xteamr_l_4x64(val, rv, tvs, td, + __kmpc_rfun_sum_l, __kmpc_rfun_sum_lds_l, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_4x64 + (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv) + { __kmpc_xteamr_ul_4x64(val, rv, tvs, td, + __kmpc_rfun_sum_ul, __kmpc_rfun_sum_lds_ul, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_8x32 + (double val, double *rv, double *tvs, uint32_t *td, const double iv) + { __kmpc_xteamr_d_8x32(val, rv, tvs, td, + __kmpc_rfun_sum_d, __kmpc_rfun_sum_lds_d, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_8x32 + (float val, float *rv, float *tvs, uint32_t *td, const float iv) + { __kmpc_xteamr_f_8x32(val, rv, tvs, td, + __kmpc_rfun_sum_f, __kmpc_rfun_sum_lds_f, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_8x32 + (_CD val, _CD *rv, _CD *tvs, uint32_t *td, const _CD iv) + { __kmpc_xteamr_cd_8x32(val, rv, tvs, td, + __kmpc_rfun_sum_cd, __kmpc_rfun_sum_lds_cd, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_8x32 + (_CF val, _CF *rv, _CF *tvs, uint32_t *td, const _CF iv) + { __kmpc_xteamr_cf_8x32(val, rv, tvs, td, + __kmpc_rfun_sum_cf, __kmpc_rfun_sum_lds_cf, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_8x32 + (int val, int *rv, int *tvs, uint32_t *td, const int iv) + { __kmpc_xteamr_i_8x32(val, rv, tvs, td, + __kmpc_rfun_sum_i, __kmpc_rfun_sum_lds_i, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_8x32 + (_UI val, 
_UI *rv, _UI *tvs, uint32_t *td, const _UI iv) + { __kmpc_xteamr_ui_8x32(val, rv, tvs, td, + __kmpc_rfun_sum_ui, __kmpc_rfun_sum_lds_ui, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_8x32 + (long val, long *rv, long *tvs, uint32_t *td, const long iv) + { __kmpc_xteamr_l_8x32(val, rv, tvs, td, + __kmpc_rfun_sum_l, __kmpc_rfun_sum_lds_l, iv);} +void _INLINE_ATTR_ _overload_to_extern_sum_8x32 + (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv) + { __kmpc_xteamr_ul_8x32(val, rv, tvs, td, + __kmpc_rfun_sum_ul, __kmpc_rfun_sum_lds_ul, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_16x64 + (double val, double *rv, double *tvs, uint32_t *td, const double iv) + { __kmpc_xteamr_d_16x64(val, rv, tvs, td, + __kmpc_rfun_max_d, __kmpc_rfun_max_lds_d, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_16x64 + (float val, float *rv, float *tvs, uint32_t *td, const float iv) + { __kmpc_xteamr_f_16x64(val, rv, tvs, td, + __kmpc_rfun_max_f, __kmpc_rfun_max_lds_f, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_16x64 + (int val, int *rv, int *tvs, uint32_t *td, const int iv) + { __kmpc_xteamr_i_16x64(val, rv, tvs, td, + __kmpc_rfun_max_i, __kmpc_rfun_max_lds_i, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_16x64 + (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv) + { __kmpc_xteamr_ui_16x64(val, rv, tvs, td, + __kmpc_rfun_max_ui, __kmpc_rfun_max_lds_ui, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_16x64 + (long val, long *rv, long *tvs, uint32_t *td, const long iv) + { __kmpc_xteamr_l_16x64(val, rv, tvs, td, + __kmpc_rfun_max_l, __kmpc_rfun_max_lds_l, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_16x64 + (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv) + { __kmpc_xteamr_ul_16x64(val, rv, tvs, td, + __kmpc_rfun_max_ul, __kmpc_rfun_max_lds_ul, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_32x32 + (double val, double *rv, double *tvs, uint32_t *td, const double iv) + { __kmpc_xteamr_d_32x32(val, rv, tvs, td, + __kmpc_rfun_max_d, __kmpc_rfun_max_lds_d, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_32x32 + (float val, float *rv, float *tvs, uint32_t *td, const float iv) + { __kmpc_xteamr_f_32x32(val, rv, tvs, td, + __kmpc_rfun_max_f, __kmpc_rfun_max_lds_f, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_32x32 + (int val, int *rv, int *tvs, uint32_t *td, const int iv) + { __kmpc_xteamr_i_32x32(val, rv, tvs, td, + __kmpc_rfun_max_i, __kmpc_rfun_max_lds_i, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_32x32 + (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv) + { __kmpc_xteamr_ui_32x32(val, rv, tvs, td, + __kmpc_rfun_max_ui, __kmpc_rfun_max_lds_ui, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_32x32 + (long val, long *rv, long *tvs, uint32_t *td, const long iv) + { __kmpc_xteamr_l_32x32(val, rv, tvs, td, + __kmpc_rfun_max_l, __kmpc_rfun_max_lds_l, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_32x32 + (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv) + { __kmpc_xteamr_ul_32x32(val, rv, tvs, td, + __kmpc_rfun_max_ul, __kmpc_rfun_max_lds_ul, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_8x64 + (double val, double *rv, double *tvs, uint32_t *td, const double iv) + { __kmpc_xteamr_d_8x64(val, rv, tvs, td, + __kmpc_rfun_max_d, __kmpc_rfun_max_lds_d, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_8x64 + (float val, float *rv, float *tvs, uint32_t *td, const float iv) + { __kmpc_xteamr_f_8x64(val, rv, tvs, td, + __kmpc_rfun_max_f, __kmpc_rfun_max_lds_f, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_8x64 + (int val, int *rv, int *tvs, 
uint32_t *td, const int iv) + { __kmpc_xteamr_i_8x64(val, rv, tvs, td, + __kmpc_rfun_max_i, __kmpc_rfun_max_lds_i, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_8x64 + (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv) + { __kmpc_xteamr_ui_8x64(val, rv, tvs, td, + __kmpc_rfun_max_ui, __kmpc_rfun_max_lds_ui, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_8x64 + (long val, long *rv, long *tvs, uint32_t *td, const long iv) + { __kmpc_xteamr_l_8x64(val, rv, tvs, td, + __kmpc_rfun_max_l, __kmpc_rfun_max_lds_l, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_8x64 + (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv) + { __kmpc_xteamr_ul_8x64(val, rv, tvs, td, + __kmpc_rfun_max_ul, __kmpc_rfun_max_lds_ul, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_16x32 + (double val, double *rv, double *tvs, uint32_t *td, const double iv) + { __kmpc_xteamr_d_16x32(val, rv, tvs, td, + __kmpc_rfun_max_d, __kmpc_rfun_max_lds_d, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_16x32 + (float val, float *rv, float *tvs, uint32_t *td, const float iv) + { __kmpc_xteamr_f_16x32(val, rv, tvs, td, + __kmpc_rfun_max_f, __kmpc_rfun_max_lds_f, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_16x32 + (int val, int *rv, int *tvs, uint32_t *td, const int iv) + { __kmpc_xteamr_i_16x32(val, rv, tvs, td, + __kmpc_rfun_max_i, __kmpc_rfun_max_lds_i, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_16x32 + (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv) + { __kmpc_xteamr_ui_16x32(val, rv, tvs, td, + __kmpc_rfun_max_ui, __kmpc_rfun_max_lds_ui, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_16x32 + (long val, long *rv, long *tvs, uint32_t *td, const long iv) + { __kmpc_xteamr_l_16x32(val, rv, tvs, td, + __kmpc_rfun_max_l, __kmpc_rfun_max_lds_l, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_16x32 + (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv) + { __kmpc_xteamr_ul_16x32(val, rv, tvs, td, + __kmpc_rfun_max_ul, __kmpc_rfun_max_lds_ul, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_4x64 + (double val, double *rv, double *tvs, uint32_t *td, const double iv) + { __kmpc_xteamr_d_4x64(val, rv, tvs, td, + __kmpc_rfun_max_d, __kmpc_rfun_max_lds_d, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_4x64 + (float val, float *rv, float *tvs, uint32_t *td, const float iv) + { __kmpc_xteamr_f_4x64(val, rv, tvs, td, + __kmpc_rfun_max_f, __kmpc_rfun_max_lds_f, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_4x64 + (int val, int *rv, int *tvs, uint32_t *td, const int iv) + { __kmpc_xteamr_i_4x64(val, rv, tvs, td, + __kmpc_rfun_max_i, __kmpc_rfun_max_lds_i, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_4x64 + (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv) + { __kmpc_xteamr_ui_4x64(val, rv, tvs, td, + __kmpc_rfun_max_ui, __kmpc_rfun_max_lds_ui, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_4x64 + (long val, long *rv, long *tvs, uint32_t *td, const long iv) + { __kmpc_xteamr_l_4x64(val, rv, tvs, td, + __kmpc_rfun_max_l, __kmpc_rfun_max_lds_l, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_4x64 + (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv) + { __kmpc_xteamr_ul_4x64(val, rv, tvs, td, + __kmpc_rfun_max_ul, __kmpc_rfun_max_lds_ul, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_8x32 + (double val, double *rv, double *tvs, uint32_t *td, const double iv) + { __kmpc_xteamr_d_8x32(val, rv, tvs, td, + __kmpc_rfun_max_d, __kmpc_rfun_max_lds_d, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_8x32 + (float val, float *rv, float *tvs, uint32_t *td, const float iv) + { 
__kmpc_xteamr_f_8x32(val, rv, tvs, td, + __kmpc_rfun_max_f, __kmpc_rfun_max_lds_f, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_8x32 + (int val, int *rv, int *tvs, uint32_t *td, const int iv) + { __kmpc_xteamr_i_8x32(val, rv, tvs, td, + __kmpc_rfun_max_i, __kmpc_rfun_max_lds_i, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_8x32 + (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv) + { __kmpc_xteamr_ui_8x32(val, rv, tvs, td, + __kmpc_rfun_max_ui, __kmpc_rfun_max_lds_ui, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_8x32 + (long val, long *rv, long *tvs, uint32_t *td, const long iv) + { __kmpc_xteamr_l_8x32(val, rv, tvs, td, + __kmpc_rfun_max_l, __kmpc_rfun_max_lds_l, iv);} +void _INLINE_ATTR_ _overload_to_extern_max_8x32 + (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv) + { __kmpc_xteamr_ul_8x32(val, rv, tvs, td, + __kmpc_rfun_max_ul, __kmpc_rfun_max_lds_ul, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_16x64 + (double val, double *rv, double *tvs, uint32_t *td, const double iv) + { __kmpc_xteamr_d_16x64(val, rv, tvs, td, + __kmpc_rfun_min_d, __kmpc_rfun_min_lds_d, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_16x64 + (float val, float *rv, float *tvs, uint32_t *td, const float iv) + { __kmpc_xteamr_f_16x64(val, rv, tvs, td, + __kmpc_rfun_min_f, __kmpc_rfun_min_lds_f, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_16x64 + (int val, int *rv, int *tvs, uint32_t *td, const int iv) + { __kmpc_xteamr_i_16x64(val, rv, tvs, td, + __kmpc_rfun_min_i, __kmpc_rfun_min_lds_i, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_16x64 + (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv) + { __kmpc_xteamr_ui_16x64(val, rv, tvs, td, + __kmpc_rfun_min_ui, __kmpc_rfun_min_lds_ui, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_16x64 + (long val, long *rv, long *tvs, uint32_t *td, const long iv) + { __kmpc_xteamr_l_16x64(val, rv, tvs, td, + __kmpc_rfun_min_l, __kmpc_rfun_min_lds_l, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_16x64 + (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv) + { __kmpc_xteamr_ul_16x64(val, rv, tvs, td, + __kmpc_rfun_min_ul, __kmpc_rfun_min_lds_ul, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_32x32 + (double val, double *rv, double *tvs, uint32_t *td, const double iv) + { __kmpc_xteamr_d_32x32(val, rv, tvs, td, + __kmpc_rfun_min_d, __kmpc_rfun_min_lds_d, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_32x32 + (float val, float *rv, float *tvs, uint32_t *td, const float iv) + { __kmpc_xteamr_f_32x32(val, rv, tvs, td, + __kmpc_rfun_min_f, __kmpc_rfun_min_lds_f, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_32x32 + (int val, int *rv, int *tvs, uint32_t *td, const int iv) + { __kmpc_xteamr_i_32x32(val, rv, tvs, td, + __kmpc_rfun_min_i, __kmpc_rfun_min_lds_i, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_32x32 + (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv) + { __kmpc_xteamr_ui_32x32(val, rv, tvs, td, + __kmpc_rfun_min_ui, __kmpc_rfun_min_lds_ui, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_32x32 + (long val, long *rv, long *tvs, uint32_t *td, const long iv) + { __kmpc_xteamr_l_32x32(val, rv, tvs, td, + __kmpc_rfun_min_l, __kmpc_rfun_min_lds_l, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_32x32 + (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv) + { __kmpc_xteamr_ul_32x32(val, rv, tvs, td, + __kmpc_rfun_min_ul, __kmpc_rfun_min_lds_ul, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_8x64 + (double val, double *rv, double *tvs, uint32_t *td, const double iv) + { __kmpc_xteamr_d_8x64(val, rv, 
tvs, td, + __kmpc_rfun_min_d, __kmpc_rfun_min_lds_d, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_8x64 + (float val, float *rv, float *tvs, uint32_t *td, const float iv) + { __kmpc_xteamr_f_8x64(val, rv, tvs, td, + __kmpc_rfun_min_f, __kmpc_rfun_min_lds_f, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_8x64 + (int val, int *rv, int *tvs, uint32_t *td, const int iv) + { __kmpc_xteamr_i_8x64(val, rv, tvs, td, + __kmpc_rfun_min_i, __kmpc_rfun_min_lds_i, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_8x64 + (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv) + { __kmpc_xteamr_ui_8x64(val, rv, tvs, td, + __kmpc_rfun_min_ui, __kmpc_rfun_min_lds_ui, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_8x64 + (long val, long *rv, long *tvs, uint32_t *td, const long iv) + { __kmpc_xteamr_l_8x64(val, rv, tvs, td, + __kmpc_rfun_min_l, __kmpc_rfun_min_lds_l, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_8x64 + (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv) + { __kmpc_xteamr_ul_8x64(val, rv, tvs, td, + __kmpc_rfun_min_ul, __kmpc_rfun_min_lds_ul, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_16x32 + (double val, double *rv, double *tvs, uint32_t *td, const double iv) + { __kmpc_xteamr_d_16x32(val, rv, tvs, td, + __kmpc_rfun_min_d, __kmpc_rfun_min_lds_d, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_16x32 + (float val, float *rv, float *tvs, uint32_t *td, const float iv) + { __kmpc_xteamr_f_16x32(val, rv, tvs, td, + __kmpc_rfun_min_f, __kmpc_rfun_min_lds_f, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_16x32 + (int val, int *rv, int *tvs, uint32_t *td, const int iv) + { __kmpc_xteamr_i_16x32(val, rv, tvs, td, + __kmpc_rfun_min_i, __kmpc_rfun_min_lds_i, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_16x32 + (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv) + { __kmpc_xteamr_ui_16x32(val, rv, tvs, td, + __kmpc_rfun_min_ui, __kmpc_rfun_min_lds_ui, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_16x32 + (long val, long *rv, long *tvs, uint32_t *td, const long iv) + { __kmpc_xteamr_l_16x32(val, rv, tvs, td, + __kmpc_rfun_min_l, __kmpc_rfun_min_lds_l, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_16x32 + (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv) + { __kmpc_xteamr_ul_16x32(val, rv, tvs, td, + __kmpc_rfun_min_ul, __kmpc_rfun_min_lds_ul, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_4x64 + (double val, double *rv, double *tvs, uint32_t *td, const double iv) + { __kmpc_xteamr_d_4x64(val, rv, tvs, td, + __kmpc_rfun_min_d, __kmpc_rfun_min_lds_d, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_4x64 + (float val, float *rv, float *tvs, uint32_t *td, const float iv) + { __kmpc_xteamr_f_4x64(val, rv, tvs, td, + __kmpc_rfun_min_f, __kmpc_rfun_min_lds_f, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_4x64 + (int val, int *rv, int *tvs, uint32_t *td, const int iv) + { __kmpc_xteamr_i_4x64(val, rv, tvs, td, + __kmpc_rfun_min_i, __kmpc_rfun_min_lds_i, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_4x64 + (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv) + { __kmpc_xteamr_ui_4x64(val, rv, tvs, td, + __kmpc_rfun_min_ui, __kmpc_rfun_min_lds_ui, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_4x64 + (long val, long *rv, long *tvs, uint32_t *td, const long iv) + { __kmpc_xteamr_l_4x64(val, rv, tvs, td, + __kmpc_rfun_min_l, __kmpc_rfun_min_lds_l, iv);} +void _INLINE_ATTR_ _overload_to_extern_min_4x64 + (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv) + { __kmpc_xteamr_ul_4x64(val, rv, tvs, td, + __kmpc_rfun_min_ul, 
__kmpc_rfun_min_lds_ul, iv);}
+void _INLINE_ATTR_ _overload_to_extern_min_8x32
+ (double val, double *rv, double *tvs, uint32_t *td, const double iv)
+ { __kmpc_xteamr_d_8x32(val, rv, tvs, td,
+   __kmpc_rfun_min_d, __kmpc_rfun_min_lds_d, iv);}
+void _INLINE_ATTR_ _overload_to_extern_min_8x32
+ (float val, float *rv, float *tvs, uint32_t *td, const float iv)
+ { __kmpc_xteamr_f_8x32(val, rv, tvs, td,
+   __kmpc_rfun_min_f, __kmpc_rfun_min_lds_f, iv);}
+void _INLINE_ATTR_ _overload_to_extern_min_8x32
+ (int val, int *rv, int *tvs, uint32_t *td, const int iv)
+ { __kmpc_xteamr_i_8x32(val, rv, tvs, td,
+   __kmpc_rfun_min_i, __kmpc_rfun_min_lds_i, iv);}
+void _INLINE_ATTR_ _overload_to_extern_min_8x32
+ (_UI val, _UI *rv, _UI *tvs, uint32_t *td, const _UI iv)
+ { __kmpc_xteamr_ui_8x32(val, rv, tvs, td,
+   __kmpc_rfun_min_ui, __kmpc_rfun_min_lds_ui, iv);}
+void _INLINE_ATTR_ _overload_to_extern_min_8x32
+ (long val, long *rv, long *tvs, uint32_t *td, const long iv)
+ { __kmpc_xteamr_l_8x32(val, rv, tvs, td,
+   __kmpc_rfun_min_l, __kmpc_rfun_min_lds_l, iv);}
+void _INLINE_ATTR_ _overload_to_extern_min_8x32
+ (_UL val, _UL *rv, _UL *tvs, uint32_t *td, const _UL iv)
+ { __kmpc_xteamr_ul_8x32(val, rv, tvs, td,
+   __kmpc_rfun_min_ul, __kmpc_rfun_min_lds_ul, iv);}
+#undef _CD
+#undef _CF
+#undef _UI
+#undef _UL
+#undef _INLINE_ATTR_
diff --git a/openmp/libomptarget/test/xteamr/test_xteamr.cpp b/openmp/libomptarget/test/xteamr/test_xteamr.cpp
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/test/xteamr/test_xteamr.cpp
@@ -0,0 +1,712 @@
+//===----- test_xteamr.cpp - Test for Xteamr DeviceRTL functions ---C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// performance and functional tests for Xteamr reduction helper functions in
+// libomptarget/DeviceRTL/src/Xteamr.cpp
+//
+// RUN: %libomptarget-compileoptxx-run-and-check-nvptx64-nvidia-cuda
+// REQUIRES: nvptx64-nvidia-cuda
+// CHECK: ALL TESTS PASSED
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <iomanip>
+#include <iostream>
+#include <limits>
+#include <numeric>
+#include <omp.h>
+#include <string>
+#include <vector>
+
+#include "test_xteamr.h"
+
+#ifndef _ARRAY_SIZE
+#define _ARRAY_SIZE 33554432
+#endif
+const uint64_t ARRAY_SIZE = _ARRAY_SIZE;
+unsigned int repeat_num_times = 12;
+unsigned int ignore_times = 2; // ignore this many timings first
+
+// If we know at compile time that we have 0 index with 1 stride,
+// then generate an optimized _BIG_JUMP_LOOP.
+// This test case has index 0 and stride 1, so we set this here.
+#define __OPTIMIZE_INDEX0_STRIDE1
+
+// Extern Xteamr functions are designed for 1024, 512, and 256 thread blocks.
+// The default here is 512.
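+// As an illustration (a sketch of the pattern used by the sim_* functions
+// below, with hypothetical names), each device thread accumulates a private
+// partial value with a big-jump loop and then calls the matching helper,
+// e.g. for a double sum with 512 threads on a warp-size-64 GPU:
+//   double val = rnv;                 // start from the reduction null value
+//   for (int64_t i = k; i < size; i += nteams * 512)
+//     val += a[i] * b[i];             // thread-local partial result
+//   __kmpc_xteamr_d_8x64(val, &result, team_vals, teams_done,
+//                        __kmpc_rfun_sum_d, __kmpc_rfun_sum_lds_d, rnv);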
+
+#ifndef _XTEAM_NUM_THREADS
+#define _XTEAM_NUM_THREADS 512
+#endif
+#ifndef _XTEAM_NUM_TEAMS
+#define _XTEAM_NUM_TEAMS 80
+#endif
+
+#if _XTEAM_NUM_THREADS == 1024
+#define _SUM_OVERLOAD_64_FCT _overload_to_extern_sum_16x64
+#define _SUM_OVERLOAD_32_FCT _overload_to_extern_sum_32x32
+#define _MAX_OVERLOAD_64_FCT _overload_to_extern_max_16x64
+#define _MAX_OVERLOAD_32_FCT _overload_to_extern_max_32x32
+#define _MIN_OVERLOAD_64_FCT _overload_to_extern_min_16x64
+#define _MIN_OVERLOAD_32_FCT _overload_to_extern_min_32x32
+#elif _XTEAM_NUM_THREADS == 512
+#define _SUM_OVERLOAD_64_FCT _overload_to_extern_sum_8x64
+#define _SUM_OVERLOAD_32_FCT _overload_to_extern_sum_16x32
+#define _MAX_OVERLOAD_64_FCT _overload_to_extern_max_8x64
+#define _MAX_OVERLOAD_32_FCT _overload_to_extern_max_16x32
+#define _MIN_OVERLOAD_64_FCT _overload_to_extern_min_8x64
+#define _MIN_OVERLOAD_32_FCT _overload_to_extern_min_16x32
+#elif _XTEAM_NUM_THREADS == 256
+#define _SUM_OVERLOAD_64_FCT _overload_to_extern_sum_4x64
+#define _SUM_OVERLOAD_32_FCT _overload_to_extern_sum_8x32
+#define _MAX_OVERLOAD_64_FCT _overload_to_extern_max_4x64
+#define _MAX_OVERLOAD_32_FCT _overload_to_extern_max_8x32
+#define _MIN_OVERLOAD_64_FCT _overload_to_extern_min_4x64
+#define _MIN_OVERLOAD_32_FCT _overload_to_extern_min_8x32
+#else
+#error Invalid value for _XTEAM_NUM_THREADS. Must be 1024, 512, or 256
+#endif
+
+// Question to Dhruva: should the limiter include the stride?
+#if defined(__NVPTX__) && _XTEAM_NUM_THREADS == 1024
+// CUDA may restrict the maximum number of threads when 1024 are requested,
+// so the big jump on the inner loop depends on the actual (limited) number
+// of threads returned by omp_get_num_threads(). It also requires that we
+// call the helper reducer function only when k is in this range. Lastly,
+// the helper function must clear (set to rnv) unused xwave values before
+// the optimized (unrolled) xwave reduction loop. See Xteamr.cpp.
+// These three things kill performance on NVIDIA GPUs when 1024 threads are
+// requested, so codegen requests at most 512 threads (16x32) for NVIDIA
+// GPUs.
+#define _LIMIT_JUMP_TO_CUDA_REDUCED_THREADS(nteams) \
+  if (k < (nteams * omp_get_num_threads()))
+#ifdef __OPTIMIZE_INDEX0_STRIDE1
+#define _BIG_JUMP_LOOP(nteams, size, stride, offset) \
+  for (int64_t i = k; i < size; i += nteams * omp_get_num_threads())
+#else
+#define _BIG_JUMP_LOOP(nteams, size, stride, offset) \
+  for (int64_t i = ((k * stride) + offset); i < size; \
+       i += (nteams * omp_get_num_threads() * stride))
+#endif
+#else
+// Assume AMDGPU, or NVIDIA with 512 or 256 threads, always gets the
+// requested number of threads, so no conditional is needed to limit
+// reductions.
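+// (On this path the limiter macro below expands to nothing, while on the
+// CUDA 1024-thread path above it expands to the guard
+// `if (k < (nteams * omp_get_num_threads()))` around the helper call.)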
+#define _LIMIT_JUMP_TO_CUDA_REDUCED_THREADS(nteams)
+
+// Format of _BIG_JUMP_LOOP depends on whether we optimize for 0 index with
+// 1 stride.
+#if _XTEAM_NUM_THREADS == 1024
+#ifdef __OPTIMIZE_INDEX0_STRIDE1
+#define _BIG_JUMP_LOOP(nteams, size, stride, offset) \
+  for (int64_t i = k; i < size; i += nteams * 1024)
+#else
+#define _BIG_JUMP_LOOP(nteams, size, stride, offset) \
+  for (int64_t i = ((k * stride) + offset); i < size; \
+       i += (nteams * 1024 * stride))
+#endif
+
+#elif _XTEAM_NUM_THREADS == 512
+#ifdef __OPTIMIZE_INDEX0_STRIDE1
+#define _BIG_JUMP_LOOP(nteams, size, stride, offset) \
+  for (int64_t i = k; i < size; i += nteams * 512)
+#else
+#define _BIG_JUMP_LOOP(nteams, size, stride, offset) \
+  for (int64_t i = ((k * stride) + offset); i < size; \
+       i += (nteams * 512 * stride))
+#endif
+#else
+#ifdef __OPTIMIZE_INDEX0_STRIDE1
+#define _BIG_JUMP_LOOP(nteams, size, stride, offset) \
+  for (int64_t i = k; i < size; i += nteams * 256)
+#else
+#define _BIG_JUMP_LOOP(nteams, size, stride, offset) \
+  for (int64_t i = ((k * stride) + offset); i < size; \
+       i += (nteams * 256 * stride))
+#endif
+#endif // end if _XTEAM_NUM_THREADS == 1024, elif, else
+#endif // if defined(__NVPTX__) && _XTEAM_NUM_THREADS == 1024 else
+
+unsigned int test_run_rc = 0;
+
+template <typename T, bool DATA_TYPE_IS_INT> void run_tests(const uint64_t);
+template <typename TC, typename T> void run_tests_complex(const uint64_t);
+
+int main(int argc, char *argv[]) {
+  std::cout << std::endl
+            << "TEST DOUBLE " << _XTEAM_NUM_THREADS << " THREADS" << std::endl;
+  run_tests<double, false>(ARRAY_SIZE);
+  std::cout << std::endl
+            << "TEST FLOAT " << _XTEAM_NUM_THREADS << " THREADS" << std::endl;
+  run_tests<float, false>(ARRAY_SIZE);
+  std::cout << std::endl
+            << "TEST INT " << _XTEAM_NUM_THREADS << " THREADS" << std::endl;
+  run_tests<int, true>(ARRAY_SIZE);
+  std::cout << std::endl
+            << "TEST UNSIGNED INT " << _XTEAM_NUM_THREADS << " THREADS"
+            << std::endl;
+  run_tests<unsigned int, true>(ARRAY_SIZE);
+  std::cout << std::endl
+            << "TEST LONG " << _XTEAM_NUM_THREADS << " THREADS " << std::endl;
+  run_tests<long, true>(ARRAY_SIZE);
+  std::cout << std::endl
+            << "TEST UNSIGNED LONG " << _XTEAM_NUM_THREADS << " THREADS"
+            << std::endl;
+  run_tests<unsigned long, true>(ARRAY_SIZE);
+  std::cout << std::endl
+            << "TEST DOUBLE COMPLEX " << _XTEAM_NUM_THREADS << " THREADS"
+            << std::endl;
+  run_tests_complex<double _Complex, double>(ARRAY_SIZE);
+  std::cout << std::endl
+            << "TEST FLOAT COMPLEX " << _XTEAM_NUM_THREADS << " THREADS"
+            << std::endl;
+  run_tests_complex<float _Complex, float>(ARRAY_SIZE);
+  if (test_run_rc == 0)
+    printf("ALL TESTS PASSED\n");
+  return test_run_rc;
+}
+
+template <typename T> T omp_dot(T *a, T *b, uint64_t array_size) {
+  T sum = 0.0;
+#pragma omp target teams distribute parallel for map(tofrom: sum) reduction(+:sum)
+  for (int64_t i = 0; i < array_size; i++)
+    sum += a[i] * b[i];
+  return sum;
+}
+
+template <typename T> T omp_max(T *c, uint64_t array_size) {
+  T maxval = std::numeric_limits<T>::lowest();
+#pragma omp target teams distribute parallel for map(tofrom \
+                                                     : maxval) \
+    reduction(max \
+              : maxval)
+  for (int64_t i = 0; i < array_size; i++)
+    maxval = (c[i] > maxval) ? c[i] : maxval;
+  return maxval;
+}
+
+template <typename T> T omp_min(T *c, uint64_t array_size) {
+  T minval = std::numeric_limits<T>::max();
+#pragma omp target teams distribute parallel for map(tofrom \
+                                                     : minval) \
+    reduction(min \
+              : minval)
+  for (int64_t i = 0; i < array_size; i++) {
+    minval = (c[i] < minval) ? c[i] : minval;
+  }
+  return minval;
+}
+
+template <typename T> T sim_dot(T *a, T *b, int warp_size) {
+  T sum = T(0);
+  int devid = 0;
+  struct loop_ctl_t {
+    uint32_t *td_ptr;                 // Atomic counter accessed on device
+    uint32_t reserved;                // reserved
+    const int64_t stride = 1;         // stride to process input vectors
+    const int64_t offset = 0;         // Offset to initial index of input vectors
+    const int64_t size = _ARRAY_SIZE; // Size of input vectors
+    const T rnv = T(0);               // reduction null value
+    T *team_vals;                     // array of global team values
+  };
+  static uint32_t zero = 0;
+  static loop_ctl_t lc0;
+  static int64_t num_teams0 = 0;
+  if (!num_teams0) {
+    // num_teams0 = ompx_get_device_num_units(devid);
+    num_teams0 = _XTEAM_NUM_TEAMS;
+    lc0.td_ptr = (uint32_t *)omp_target_alloc(sizeof(uint32_t), devid);
+    lc0.team_vals = (T *)omp_target_alloc(sizeof(T) * num_teams0, devid);
+    omp_target_memcpy(lc0.td_ptr, &zero, sizeof(uint32_t), 0, 0, devid,
+                      omp_get_initial_device());
+  }
+
+  if (warp_size == 64) {
+#pragma omp target teams distribute parallel for num_teams(_XTEAM_NUM_TEAMS) \
+    num_threads(_XTEAM_NUM_THREADS) map(tofrom:sum) map(to:lc0)
+    for (uint64_t k = 0; k < (_XTEAM_NUM_TEAMS * _XTEAM_NUM_THREADS); k++) {
+      T val0 = lc0.rnv;
+      _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc0.size, lc0.stride, lc0.offset)
+      val0 += a[i] * b[i];
+      _SUM_OVERLOAD_64_FCT(val0, &sum, lc0.team_vals, lc0.td_ptr, lc0.rnv);
+    }
+  } else {
+#pragma omp target teams distribute parallel for num_teams(_XTEAM_NUM_TEAMS) \
+    num_threads(_XTEAM_NUM_THREADS) map(tofrom:sum) map(to:lc0)
+    for (uint64_t k = 0; k < (_XTEAM_NUM_TEAMS * _XTEAM_NUM_THREADS); k++) {
+      T val0 = lc0.rnv;
+      _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc0.size, lc0.stride, lc0.offset)
+      val0 += a[i] * b[i];
+      _LIMIT_JUMP_TO_CUDA_REDUCED_THREADS(_XTEAM_NUM_TEAMS)
+      _SUM_OVERLOAD_32_FCT(val0, &sum, lc0.team_vals, lc0.td_ptr, lc0.rnv);
+    }
+  }
+  return sum;
+}
+
+template <typename T> T sim_max(T *c, int warp_size) {
+  T retval = std::numeric_limits<T>::lowest();
+  int devid = 0;
+  struct loop_ctl_t {
+    uint32_t *td_ptr;                 // Atomic counter accessed on device
+    uint32_t reserved;                // reserved
+    const int64_t stride = 1;         // stride to process input vectors
+    const int64_t offset = 0;         // Offset to index of input vectors
+    const int64_t size = _ARRAY_SIZE; // Size of input vectors
+    T rnv;                            // reduction null value
+    T *team_vals;                     // array of global team values
+  };
+  static uint32_t zero = 0;
+  static loop_ctl_t lc1;
+  static int64_t num_teams1 = 0;
+  if (!num_teams1) {
+    // num_teams1 = ompx_get_device_num_units(devid);
+    num_teams1 = _XTEAM_NUM_TEAMS;
+    lc1.td_ptr = (uint32_t *)omp_target_alloc(sizeof(uint32_t), devid);
+    lc1.rnv = std::numeric_limits<T>::lowest();
+    lc1.team_vals = (T *)omp_target_alloc(sizeof(T) * num_teams1, devid);
+    omp_target_memcpy(lc1.td_ptr, &zero, sizeof(uint32_t), 0, 0, devid,
+                      omp_get_initial_device());
+  }
+  if (warp_size == 64) {
+#pragma omp target teams distribute parallel for num_teams(_XTEAM_NUM_TEAMS) \
+    num_threads(_XTEAM_NUM_THREADS) map(tofrom:retval) map(to:lc1)
+    for (uint64_t k = 0; k < (_XTEAM_NUM_TEAMS * _XTEAM_NUM_THREADS); k++) {
+      T val1 = lc1.rnv;
+      _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc1.size, lc1.stride, lc1.offset)
+      val1 = (c[i] > val1) ? c[i] : val1;
+      _MAX_OVERLOAD_64_FCT(val1, &retval, lc1.team_vals, lc1.td_ptr, lc1.rnv);
+    }
+  } else {
+#pragma omp target teams distribute parallel for num_teams(_XTEAM_NUM_TEAMS) \
+    num_threads(_XTEAM_NUM_THREADS) map(tofrom:retval) map(to:lc1)
+    for (uint64_t k = 0; k < (_XTEAM_NUM_TEAMS * _XTEAM_NUM_THREADS); k++) {
+      T val1 = lc1.rnv;
+      _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc1.size, lc1.stride, lc1.offset)
+      val1 = (c[i] > val1) ? c[i] : val1;
+      _LIMIT_JUMP_TO_CUDA_REDUCED_THREADS(_XTEAM_NUM_TEAMS)
+      _MAX_OVERLOAD_32_FCT(val1, &retval, lc1.team_vals, lc1.td_ptr, lc1.rnv);
+    }
+  }
+  return retval;
+}
+
+template <typename T> T sim_min(T *c, int warp_size) {
+  T retval = std::numeric_limits<T>::max();
+  int devid = 0;
+  struct loop_ctl_t {
+    uint32_t *td_ptr;                 // Atomic counter accessed on device
+    uint32_t reserved;                // reserved
+    const int64_t stride = 1;         // stride to process input vectors
+    const int64_t offset = 0;         // Offset to initial index of input vectors
+    const int64_t size = _ARRAY_SIZE; // Size of input vectors
+    T rnv;                            // reduction null value
+    T *team_vals;                     // array of global team values
+  };
+  static uint32_t zero = 0;
+  static loop_ctl_t lc2;
+  static int64_t num_teams2 = 0;
+  if (!num_teams2) {
+    // num_teams2 = ompx_get_device_num_units(devid);
+    num_teams2 = _XTEAM_NUM_TEAMS;
+    lc2.td_ptr = (uint32_t *)omp_target_alloc(sizeof(uint32_t), devid);
+    lc2.rnv = std::numeric_limits<T>::max();
+    lc2.team_vals = (T *)omp_target_alloc(sizeof(T) * num_teams2, devid);
+    omp_target_memcpy(lc2.td_ptr, &zero, sizeof(uint32_t), 0, 0, devid,
+                      omp_get_initial_device());
+  }
+  if (warp_size == 64) {
+#pragma omp target teams distribute parallel for num_teams(_XTEAM_NUM_TEAMS) \
+    num_threads(_XTEAM_NUM_THREADS) map(tofrom:retval) map(to:lc2)
+    for (uint64_t k = 0; k < (_XTEAM_NUM_TEAMS * _XTEAM_NUM_THREADS); k++) {
+      T val2 = lc2.rnv;
+      _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc2.size, lc2.stride, lc2.offset)
+      val2 = (c[i] < val2) ? c[i] : val2;
+      _MIN_OVERLOAD_64_FCT(val2, &retval, lc2.team_vals, lc2.td_ptr, lc2.rnv);
+    }
+  } else {
+#pragma omp target teams distribute parallel for num_teams(_XTEAM_NUM_TEAMS) \
+    num_threads(_XTEAM_NUM_THREADS) map(tofrom:retval) map(to:lc2)
+    for (uint64_t k = 0; k < (_XTEAM_NUM_TEAMS * _XTEAM_NUM_THREADS); k++) {
+      T val2 = lc2.rnv;
+      _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc2.size, lc2.stride, lc2.offset)
+      val2 = (c[i] < val2) ? c[i] : val2;
+      _LIMIT_JUMP_TO_CUDA_REDUCED_THREADS(_XTEAM_NUM_TEAMS)
+      _MIN_OVERLOAD_32_FCT(val2, &retval, lc2.team_vals, lc2.td_ptr, lc2.rnv);
+    }
+  }
+  return retval;
+}
+
+template <typename T, bool DATA_TYPE_IS_INT>
+void _check_val(T computed_val, T gold_val, const char *msg) {
+  double ETOL = 0.0000001;
+  if (DATA_TYPE_IS_INT) {
+    if (computed_val != gold_val) {
+      std::cerr << msg << " FAIL "
+                << "Integer value was " << computed_val << " but should be "
+                << gold_val << std::endl;
+      test_run_rc = 1;
+    }
+  } else {
+    double dcomputed_val = (double)computed_val;
+    double dvalgold = (double)gold_val;
+    double ompErrSum = std::abs((dcomputed_val - dvalgold) / dvalgold);
+    if (ompErrSum > ETOL) {
+      std::cerr << msg << " FAIL " << ompErrSum << " tol:" << ETOL << std::endl
+                << std::setprecision(15) << "Value was " << computed_val
+                << " but should be " << gold_val << std::endl;
+      test_run_rc = 1;
+    }
+  }
+}
+
+#define ALIGNMENT (2 * 1024 * 1024)
+
+template <typename T, bool DATA_TYPE_IS_INT>
+void run_tests(uint64_t array_size) {
+
+  // FIXME: How do we get warpsize of a device from host?
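+  // Workaround used here: launch a trivial target region and ask the
+  // DeviceRTL via __kmpc_get_warp_size().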
+  int warp_size = 64;
+#pragma omp target map(tofrom : warp_size)
+  warp_size = __kmpc_get_warp_size();
+
+  // Align on 2M boundaries
+  T *a = (T *)aligned_alloc(ALIGNMENT, sizeof(T) * array_size);
+  T *b = (T *)aligned_alloc(ALIGNMENT, sizeof(T) * array_size);
+  T *c = (T *)aligned_alloc(ALIGNMENT, sizeof(T) * array_size);
+#pragma omp target enter data map(alloc:a[0:array_size], b[0:array_size], \
+                                  c[0:array_size])
+#pragma omp target teams distribute parallel for
+  for (int64_t i = 0; i < array_size; i++) {
+    a[i] = 2;
+    b[i] = 3;
+    c[i] = (i + 1);
+  }
+
+  std::cout << "Running kernels " << repeat_num_times << " times" << std::endl;
+  std::cout << "Ignoring timing of first " << ignore_times << " runs "
+            << std::endl;
+
+  double ETOL = 0.0000001;
+  if (DATA_TYPE_IS_INT) {
+    std::cout << "Integer Size: " << sizeof(T) << std::endl;
+  } else {
+    if (sizeof(T) == sizeof(float))
+      std::cout << "Precision: float" << std::endl;
+    else
+      std::cout << "Precision: double" << std::endl;
+  }
+
+  std::cout << "Warp size:" << warp_size << std::endl;
+  // int num_teams = ompx_get_device_num_units(omp_get_default_device());
+  int num_teams = _XTEAM_NUM_TEAMS;
+  std::cout << "Array elements: " << array_size << std::endl;
+  std::cout << "Array size: " << ((array_size * sizeof(T)) / (1024 * 1024))
+            << " MB" << std::endl;
+
+  T goldDot = (T)6 * (T)array_size;
+  T goldMax = (T)array_size;
+  T goldMin = (T)1;
+
+  double goldDot_d = (double)goldDot;
+  double goldMax_d = (double)goldMax;
+  double goldMin_d = (double)goldMin;
+
+  // List of times
+  std::vector<std::vector<double>> timings(6);
+
+  // Declare timers
+  std::chrono::high_resolution_clock::time_point t1, t2;
+
+  // Timing loop
+  for (unsigned int k = 0; k < repeat_num_times; k++) {
+    t1 = std::chrono::high_resolution_clock::now();
+    T omp_sum = omp_dot(a, b, array_size);
+    t2 = std::chrono::high_resolution_clock::now();
+    timings[0].push_back(
+        std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1)
+            .count());
+    _check_val<T, DATA_TYPE_IS_INT>(omp_sum, goldDot, "omp_dot");
+
+    t1 = std::chrono::high_resolution_clock::now();
+    T sim_sum = sim_dot(a, b, warp_size);
+    t2 = std::chrono::high_resolution_clock::now();
+    timings[1].push_back(
+        std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1)
+            .count());
+    _check_val<T, DATA_TYPE_IS_INT>(sim_sum, goldDot, "sim_dot");
+
+    t1 = std::chrono::high_resolution_clock::now();
+    T omp_max_val = omp_max(c, array_size);
+    t2 = std::chrono::high_resolution_clock::now();
+    timings[2].push_back(
+        std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1)
+            .count());
+    _check_val<T, DATA_TYPE_IS_INT>(omp_max_val, goldMax, "omp_max");
+
+    t1 = std::chrono::high_resolution_clock::now();
+    T sim_max_val = sim_max(c, warp_size);
+    t2 = std::chrono::high_resolution_clock::now();
+    timings[3].push_back(
+        std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1)
+            .count());
+    _check_val<T, DATA_TYPE_IS_INT>(sim_max_val, goldMax, "sim_max");
+
+    t1 = std::chrono::high_resolution_clock::now();
+    T omp_min_val = omp_min(c, array_size);
+    t2 = std::chrono::high_resolution_clock::now();
+    timings[4].push_back(
+        std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1)
+            .count());
+    _check_val<T, DATA_TYPE_IS_INT>(omp_min_val, goldMin, "omp_min");
+
+    t1 = std::chrono::high_resolution_clock::now();
+    T sim_min_val = sim_min(c, warp_size);
+    t2 = std::chrono::high_resolution_clock::now();
+    timings[5].push_back(
+        std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1)
+            .count());
+    _check_val<T, DATA_TYPE_IS_INT>(sim_min_val, goldMin, "sim_min");
+
+  } // end Timing loop
+
+  // Display timing results
+  std::cout << std::left << std::setw(12) << "Function" << std::left
+            << std::setw(12) << "Best-MB/sec" << std::left << std::setw(12)
+            << " Min (sec)" << std::left << std::setw(12) << " Max"
+            << std::left << std::setw(12) << "Average" << std::left
+            << std::setw(12) << "Avg-MB/sec" << std::endl;
std::setw(12) << " Max" + << std::left << std::setw(12) << "Average" << std::left + << std::setw(12) << "Avg-MB/sec" << std::endl; + + std::cout << std::fixed; + + std::string labels[6] = {"ompdot", "simdot", "ompmax", + "simmax", "ompmin", "simmin"}; + size_t sizes[6] = {2 * sizeof(T) * array_size, 2 * sizeof(T) * array_size, + 1 * sizeof(T) * array_size, 1 * sizeof(T) * array_size, + 1 * sizeof(T) * array_size, 1 * sizeof(T) * array_size}; + + for (int i = 0; i < 6; i++) { + // Get min/max; ignore the first couple results + auto minmax = std::minmax_element(timings[i].begin() + ignore_times, + timings[i].end()); + // Calculate average; ignore ignore_times + double average = std::accumulate(timings[i].begin() + ignore_times, + timings[i].end(), 0.0) / + (double)(repeat_num_times - ignore_times); + printf(" %s %8.0f %8.6f %8.6f %8.6f %8.0f\n", + labels[i].c_str(), 1.0E-6 * sizes[i] / (*minmax.first), + (double)*minmax.first, (double)*minmax.second, (double)average, + 1.0E-6 * sizes[i] / (average)); + } +#pragma omp target exit data map(release: a[0:array_size], b[0:array_size], \ + c[0:array_size]) + free(a); + free(b); + free(c); +} + + +template +void _check_val_complex(TC computed_val_complex, TC gold_val_complex, + const char *msg) { + T gold_val_r = __real__(gold_val_complex); + T computed_val_r = __real__(computed_val_complex); + T gold_val_i = __imag__(gold_val_complex); + T computed_val_i = __imag__(computed_val_complex); + double ETOL = 0.0000001; + double computed_val_r_d = (double)computed_val_r; + double valgold_r_d = (double)gold_val_r; + double ompErrSum_r = abs((computed_val_r_d - valgold_r_d) / valgold_r_d); + double computed_val_i_d = (double)computed_val_i; + double valgold_i_d = (double)gold_val_i; + double ompErrSum_i = abs((computed_val_i_d - valgold_i_d) / valgold_i_d); + if ((ompErrSum_r > ETOL) || (ompErrSum_i > ETOL)) { + std::cerr << msg << " FAIL " << ompErrSum_r << " tol:" << ETOL << std::endl + << std::setprecision(15) << "Value was (" << computed_val_r + << " + " << computed_val_i << " i )" << std::endl + << " but should be (" << gold_val_r << " + " << gold_val_i + << "i) " << std::endl; + test_run_rc = 1; + } +} + +template TC omp_dot_complex(TC *a, TC *b, uint64_t array_size) { + TC dot; + __real__(dot) = 0.0; + __imag__(dot) = 0.0; +#pragma omp target teams distribute parallel for map(tofrom: dot) reduction(+:dot) + for (int64_t i = 0; i < array_size; i++) + dot += a[i] * b[i]; + return dot; +} + +template T sim_dot_complex(T *a, T *b, int warp_size) { + int devid = 0; + T zero_c; + __real__(zero_c) = 0.0; + __imag__(zero_c) = 0.0; + struct loop_ctl_t { + uint32_t *td_ptr; // Atomic counter accessed on device + uint32_t reserved; // reserved + const int64_t stride = 1; // stride to process input vectors + const int64_t offset = 0; // Offset to initial index of input vectors + const int64_t size = _ARRAY_SIZE; // Size of input vectors + T rnv; // reduction null value + T *team_vals; // array of global team values + }; + T sum = zero_c; + uint32_t zero = 0; + static loop_ctl_t lc3; + static int64_t num_teams3 = 0; + if (!num_teams3) { + // num_teams3 = ompx_get_device_num_units(devid); + num_teams3 = _XTEAM_NUM_TEAMS; + lc3.td_ptr = (uint32_t *)omp_target_alloc(sizeof(uint32_t), devid); + lc3.team_vals = (T *)omp_target_alloc(sizeof(T) * num_teams3, devid); + lc3.rnv = zero_c; + omp_target_memcpy(lc3.td_ptr, &zero, sizeof(uint32_t), 0, 0, devid, + omp_get_initial_device()); + } + + if (warp_size == 64) { +#pragma omp target teams distribute parallel for 
+    for (uint64_t k = 0; k < (_XTEAM_NUM_TEAMS * _XTEAM_NUM_THREADS); k++) {
+      T val3 = lc3.rnv;
+      _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc3.size, lc3.stride, lc3.offset)
+      val3 += a[i] * b[i];
+      _SUM_OVERLOAD_64_FCT(val3, &sum, lc3.team_vals, lc3.td_ptr, lc3.rnv);
+    }
+  } else {
+#pragma omp target teams distribute parallel for num_teams(_XTEAM_NUM_TEAMS) \
+    num_threads(_XTEAM_NUM_THREADS) map(tofrom \
+                                        : sum) map(to \
+                                                   : lc3)
+    for (uint64_t k = 0; k < (_XTEAM_NUM_TEAMS * _XTEAM_NUM_THREADS); k++) {
+      T val3 = lc3.rnv;
+      _BIG_JUMP_LOOP(_XTEAM_NUM_TEAMS, lc3.size, lc3.stride, lc3.offset)
+      val3 += a[i] * b[i];
+      _LIMIT_JUMP_TO_CUDA_REDUCED_THREADS(_XTEAM_NUM_TEAMS)
+      _SUM_OVERLOAD_32_FCT(val3, &sum, lc3.team_vals, lc3.td_ptr, lc3.rnv);
+    }
+  }
+  return sum;
+}
+
+template <typename TC, typename T>
+void run_tests_complex(const uint64_t array_size) {
+
+  // FIXME: How do we get warpsize of a device from host?
+  int warp_size = 64;
+#pragma omp target map(tofrom : warp_size)
+  warp_size = __kmpc_get_warp_size();
+
+  TC *a = (TC *)aligned_alloc(ALIGNMENT, sizeof(TC) * array_size);
+  TC *b = (TC *)aligned_alloc(ALIGNMENT, sizeof(TC) * array_size);
+
+#pragma omp target enter data map(alloc : a [0:array_size], b [0:array_size])
+  TC startA;
+  __real__(startA) = 1.0;
+  __imag__(startA) = 1.0;
+  TC startB;
+  __real__(startB) = 1.0;
+  __imag__(startB) = -1.0;
+
+#pragma omp target teams distribute parallel for
+  for (int64_t i = 0; i < array_size; i++) {
+    a[i] = startA;
+    b[i] = startB;
+    // a[i] * b[i] = 2 + 0i
+  }
+
+  std::cout << "Running kernels " << repeat_num_times << " times" << std::endl;
+  std::cout << "Ignoring timing of first " << ignore_times << " runs "
+            << std::endl;
+
+  double ETOL = 0.0000001;
+  if (sizeof(TC) == sizeof(float _Complex))
+    std::cout << "Precision: float _Complex" << std::endl;
+  else
+    std::cout << "Precision: double _Complex" << std::endl;
+
+  std::cout << "Warp size:" << warp_size << std::endl;
+  std::cout << "Array elements: " << array_size << std::endl;
+  std::cout << "Array size: " << ((array_size * sizeof(TC)) / (1024 * 1024))
+            << " MB" << std::endl;
+
+  T goldDotr = T(2) * (T)array_size;
+  T goldDoti = T(0);
+
+  TC goldDot;
+  __real__(goldDot) = goldDotr;
+  __imag__(goldDot) = goldDoti;
+
+  // List of times
+  std::vector<std::vector<double>> timings(2);
+
+  // Declare timers
+  std::chrono::high_resolution_clock::time_point t1, t2;
+
+  // timing loop
+  for (unsigned int k = 0; k < repeat_num_times; k++) {
+    t1 = std::chrono::high_resolution_clock::now();
+    TC omp_sum = omp_dot_complex(a, b, array_size);
+    t2 = std::chrono::high_resolution_clock::now();
+    timings[0].push_back(
+        std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1)
+            .count());
+    _check_val_complex<TC, T>(omp_sum, goldDot, "omp_dot");
+
+    t1 = std::chrono::high_resolution_clock::now();
+    TC sim_sum = sim_dot_complex(a, b, warp_size);
+    t2 = std::chrono::high_resolution_clock::now();
+    timings[1].push_back(
+        std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1)
+            .count());
+    _check_val_complex<TC, T>(sim_sum, goldDot, "sim_dot");
+
+  } // end timing loop
+
+  // Display timing results
+  std::cout << std::left << std::setw(12) << "Function" << std::left
+            << std::setw(12) << "Best-MB/sec" << std::left << std::setw(12)
+            << " Min (sec)" << std::left << std::setw(12) << " Max"
+            << std::left << std::setw(12) << "Average" << std::left
+            << std::setw(12) << "Avg-MB/sec" << std::endl;
+
+  std::cout << std::fixed;
+
+  std::string labels[2] = {"ompdot", "simdot"};
+  size_t sizes[2] = {2 * sizeof(TC) * array_size, 2 * sizeof(TC) * array_size};
+
+  // Display timing results
+  std::cout << std::left << std::setw(12) << "Function" << std::left
+            << std::setw(12) << "Best-MB/sec" << std::left << std::setw(12)
+            << " Min (sec)" << std::left << std::setw(12) << " Max"
+            << std::left << std::setw(12) << "Average" << std::left
+            << std::setw(12) << "Avg-MB/sec" << std::endl;
+
+  std::cout << std::fixed;
+
+  std::string labels[2] = {"ompdot", "simdot"};
+  size_t sizes[2] = {2 * sizeof(TC) * array_size, 2 * sizeof(TC) * array_size};
+
+  for (int i = 0; i < 2; i++) {
+    // Get min/max; ignore the first ignore_times results
+    auto minmax = std::minmax_element(timings[i].begin() + ignore_times,
+                                      timings[i].end());
+
+    // Calculate average; ignore the first ignore_times results
+    double average = std::accumulate(timings[i].begin() + ignore_times,
+                                     timings[i].end(), 0.0) /
+                     (double)(repeat_num_times - ignore_times);
+
+    printf(" %s %8.0f %8.6f %8.6f %8.6f %8.0f\n",
+           labels[i].c_str(), 1.0E-6 * sizes[i] / (*minmax.first),
+           (double)*minmax.first, (double)*minmax.second, (double)average,
+           1.0E-6 * sizes[i] / (average));
+  }
+#pragma omp target exit data map(release : a[0:array_size], b[0:array_size])
+  free(a);
+  free(b);
+}
diff --git a/openmp/libomptarget/test/xteamr/test_xteamr.sh b/openmp/libomptarget/test/xteamr/test_xteamr.sh
new file mode 100755
--- /dev/null
+++ b/openmp/libomptarget/test/xteamr/test_xteamr.sh
@@ -0,0 +1,67 @@
+#!/bin/bash
+#===--- test_xteamr.sh - test cross team reduction helper functions -----===//
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+#===----------------------------------------------------------------------===//
+#
+# test_xteamr.sh: Script to test the high performance reduction helper
+# functions in llvm-project/openmp/libomptarget/DeviceRTL/src/Xteamr.cpp.
+# It compiles and executes test_xteamr.cpp in 3 configs:
+# 1024, 512, and 256 device threads.
+#
+# See the README file in this directory for more information.
+#
+#===----------------------------------------------------------------------===//
+
+LLVM_INSTALL=${LLVM_INSTALL:-$HOME/llvm}
+[ ! -f $LLVM_INSTALL/bin/clang ] && echo "ERROR: no LLVM install at $LLVM_INSTALL" && exit 1
+
+OFFLOAD_ARCH=${OFFLOAD_ARCH:-sm_70}
+
+tmpdir=/tmp/$USER/xteamr && mkdir -p $tmpdir
+[ ! -d $tmpdir ] && echo "ERROR: could not create $tmpdir" && exit 1
+
+ARRAY_SIZE=${ARRAY_SIZE:-41943040}
+#ARRAY_SIZE=${ARRAY_SIZE:-33554432}
+as_arg="-D_ARRAY_SIZE=$ARRAY_SIZE"
+
+NUM_TEAMS=${NUM_TEAMS:-80}
+
+cuda_args=""
+CUDA_INSTALL=${CUDA_INSTALL:-/usr/local/cuda}
+[ -d $CUDA_INSTALL ] && cudalib=$CUDA_INSTALL/targets/x86_64-linux/lib && export LD_LIBRARY_PATH=$cudalib && cuda_args="-L$cudalib -lcudart"
+
+nt_args="-D_XTEAM_NUM_THREADS=1024 -D_XTEAM_NUM_TEAMS=$NUM_TEAMS"
+echo " COMPILE with --offload-arch=$OFFLOAD_ARCH $as_arg $nt_args"
+$LLVM_INSTALL/bin/clang++ -O3 -I. $as_arg $nt_args -fopenmp --offload-arch=$OFFLOAD_ARCH test_xteamr.cpp -o $tmpdir/xteamr_1024 $cuda_args -lstdc++ -latomic
+rc1=$?
+
+nt_args="-D_XTEAM_NUM_THREADS=512 -D_XTEAM_NUM_TEAMS=$NUM_TEAMS"
+echo " COMPILE with --offload-arch=$OFFLOAD_ARCH $as_arg $nt_args"
+$LLVM_INSTALL/bin/clang++ -O3 -I. $as_arg $nt_args -fopenmp --offload-arch=$OFFLOAD_ARCH test_xteamr.cpp -o $tmpdir/xteamr_512 $cuda_args -lstdc++ -latomic
+rc2=$?
+
+nt_args="-D_XTEAM_NUM_THREADS=256 -D_XTEAM_NUM_TEAMS=$NUM_TEAMS"
+echo " COMPILE with --offload-arch=$OFFLOAD_ARCH $as_arg $nt_args"
+$LLVM_INSTALL/bin/clang++ -O3 -I. $as_arg $nt_args -fopenmp --offload-arch=$OFFLOAD_ARCH test_xteamr.cpp -o $tmpdir/xteamr_256 $cuda_args -lstdc++ -latomic
+rc3=$?
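+
+# Each binary prints its timing/validation report to stdout, captured
+# below in $tmpdir/xteamr_<nthreads>.out; the compile and run exit codes
+# are accumulated into the final error check at the end of the script.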
+
+[ $rc1 == 0 ] && echo "START EXECUTE xteamr_1024" && $tmpdir/xteamr_1024 > $tmpdir/xteamr_1024.out
+rc4=$?
+[ $rc2 == 0 ] && echo "START EXECUTE xteamr_512" && $tmpdir/xteamr_512 > $tmpdir/xteamr_512.out
+rc5=$?
+[ $rc3 == 0 ] && echo "START EXECUTE xteamr_256" && $tmpdir/xteamr_256 > $tmpdir/xteamr_256.out
+rc6=$?
+
+echo
+rc=$(( $rc1 + $rc2 + $rc3 + $rc4 + $rc5 + $rc6 ))
+if [ $rc != 0 ] ; then
+  echo "ERRORS DETECTED!"
+else
+  echo "No errors detected"
+fi
+echo "Logs and binaries saved to $tmpdir"
+exit $rc
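+
+# Example invocation overriding the defaults (the install path, offload
+# arch, and team count shown here are illustrative only):
+#   LLVM_INSTALL=/opt/rocm/llvm OFFLOAD_ARCH=gfx90a NUM_TEAMS=104 ./test_xteamr.sh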