diff --git a/openmp/libomptarget/DeviceRTL/CMakeLists.txt b/openmp/libomptarget/DeviceRTL/CMakeLists.txt --- a/openmp/libomptarget/DeviceRTL/CMakeLists.txt +++ b/openmp/libomptarget/DeviceRTL/CMakeLists.txt @@ -113,6 +113,8 @@ ${source_directory}/Tasking.cpp ${source_directory}/Utils.cpp ${source_directory}/Workshare.cpp + ${source_directory}/Xteamr.cpp + ${source_directory}/Rfuns.cpp ) set(clang_opt_flags -O3 -mllvm -openmp-opt-disable -DSHARED_SCRATCHPAD_SIZE=512) diff --git a/openmp/libomptarget/DeviceRTL/include/Interface.h b/openmp/libomptarget/DeviceRTL/include/Interface.h --- a/openmp/libomptarget/DeviceRTL/include/Interface.h +++ b/openmp/libomptarget/DeviceRTL/include/Interface.h @@ -239,6 +239,320 @@ void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct); +/// +/// Helper functions for high performance cross team reductions. +/// +/// Name decoder: __kmpc_xteamr__x +/// number of warps/waves in team +/// 32 or 64 +/// example: __kmpc_xteam_f_16x64 is the helper function for data +/// type float with fixed teamsize of 1024 threads. +/// There are 48(6 X 8) helper functions. +/// 6 configurations of teamsize are currently created. +/// The most performant configs use max teamsize 1024: 16x64 and 32x32. +/// Currently the Other configs are 8x64, 4x64, 16x32, and 8x32. +/// 8 data types available: double, float, int, uint, long, ulong, +/// double _Complex, and float _complex +/// All xteam helper functions have these 7 args: +/// arg1: the thread local reduction value +/// arg2: pointer to where result is written +/// arg3: global array of team values for this reduction instance +/// arg4: atomic counter of completed teams for this reduction instance +/// arg5: void function pointer of pair reduction function, +/// (e.g. 
sum(&a,b),min(&a,b),max(&a,b) +/// arg6: equivalent (to arg5) void function pointer of pair reduction +/// function on LDS memory, (e.g. sum(&a,&b),min(&a,&b),max(&a,&b) +/// arg7: Initializing value for the reduction type +/// +#define _RF_LDS volatile __attribute__((address_space(3))) +void __kmpc_xteamr_d_16x64(double v, double *r_ptr, double *tvals, + uint32_t *td_ptr, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), + const double iv); +void __kmpc_xteamr_f_16x64(float v, float *r_ptr, float *tvals, + uint32_t *td_ptr, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), + const float iv); +void __kmpc_xteamr_cd_16x64(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(_RF_LDS double _Complex *, + _RF_LDS double _Complex *), + const double _Complex iv); +void __kmpc_xteamr_cf_16x64(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(_RF_LDS float _Complex *, + _RF_LDS float _Complex *), + const float _Complex iv); +void __kmpc_xteamr_i_16x64(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), + const int iv); +void __kmpc_xteamr_ui_16x64(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(_RF_LDS uint32_t *, + _RF_LDS uint32_t *), + const uint32_t iv); +void __kmpc_xteamr_l_16x64(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), + const long iv); +void __kmpc_xteamr_ul_16x64(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(_RF_LDS uint64_t *, + _RF_LDS uint64_t *), + const uint64_t iv); +void 
__kmpc_xteamr_d_8x64(double v, double *r_ptr, double *tvals, + uint32_t *td_ptr, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), + const double iv); +void __kmpc_xteamr_f_8x64(float v, float *r_ptr, float *tvals, uint32_t *td_ptr, + void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), + const float iv); +void __kmpc_xteamr_cd_8x64(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(_RF_LDS double _Complex *, + _RF_LDS double _Complex *), + const double _Complex iv); +void __kmpc_xteamr_cf_8x64(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(_RF_LDS float _Complex *, + _RF_LDS float _Complex *), + const float _Complex iv); +void __kmpc_xteamr_i_8x64(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), + const int iv); +void __kmpc_xteamr_ui_8x64(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(_RF_LDS uint32_t *, + _RF_LDS uint32_t *), + const uint32_t iv); +void __kmpc_xteamr_l_8x64(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), + const long iv); +void __kmpc_xteamr_ul_8x64(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(_RF_LDS uint64_t *, + _RF_LDS uint64_t *), + const uint64_t iv); +void __kmpc_xteamr_d_4x64(double v, double *r_ptr, double *tvals, + uint32_t *td_ptr, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), + const double iv); +void __kmpc_xteamr_f_4x64(float v, float *r_ptr, float *tvals, uint32_t *td_ptr, + void (*_rf)(float *, float), + void 
(*_rf_lds)(_RF_LDS float *, _RF_LDS float *), + const float iv); +void __kmpc_xteamr_cd_4x64(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(_RF_LDS double _Complex *, + _RF_LDS double _Complex *), + const double _Complex iv); +void __kmpc_xteamr_cf_4x64(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(_RF_LDS float _Complex *, + _RF_LDS float _Complex *), + const float _Complex iv); +void __kmpc_xteamr_i_4x64(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), + const int iv); +void __kmpc_xteamr_ui_4x64(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(_RF_LDS uint32_t *, + _RF_LDS uint32_t *), + const uint32_t iv); +void __kmpc_xteamr_l_4x64(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), + const long iv); +void __kmpc_xteamr_ul_4x64(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(_RF_LDS uint64_t *, + _RF_LDS uint64_t *), + const uint64_t iv); +void __kmpc_xteamr_d_32x32(double v, double *r_ptr, double *tvals, + uint32_t *td_ptr, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), + const double iv); +void __kmpc_xteamr_f_32x32(float v, float *r_ptr, float *tvals, + uint32_t *td_ptr, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), + const float iv); +void __kmpc_xteamr_cd_32x32(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(_RF_LDS double _Complex *, + _RF_LDS double _Complex *), + const 
double _Complex iv); +void __kmpc_xteamr_cf_32x32(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(_RF_LDS float _Complex *, + _RF_LDS float _Complex *), + const float _Complex iv); + +void __kmpc_xteamr_i_32x32(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), + const int iv); + +void __kmpc_xteamr_ui_32x32(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(_RF_LDS uint32_t *, + _RF_LDS uint32_t *), + const uint32_t iv); +void __kmpc_xteamr_l_32x32(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), + const long iv); +void __kmpc_xteamr_ul_32x32(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(_RF_LDS uint64_t *, + _RF_LDS uint64_t *), + const uint64_t iv); +void __kmpc_xteamr_d_16x32(double v, double *r_ptr, double *tvals, + uint32_t *td_ptr, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), + const double iv); +void __kmpc_xteamr_f_16x32(float v, float *r_ptr, float *tvals, + uint32_t *td_ptr, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), + const float iv); +void __kmpc_xteamr_cd_16x32(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(_RF_LDS double _Complex *, + _RF_LDS double _Complex *), + const double _Complex iv); +void __kmpc_xteamr_cf_16x32(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(_RF_LDS float _Complex *, + _RF_LDS float _Complex *), + const float _Complex iv); + +void 
__kmpc_xteamr_i_16x32(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), + const int iv); +void __kmpc_xteamr_ui_16x32(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(_RF_LDS uint32_t *, + _RF_LDS uint32_t *), + const uint32_t iv); +void __kmpc_xteamr_l_16x32(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), + const long iv); +void __kmpc_xteamr_ul_16x32(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(_RF_LDS uint64_t *, + _RF_LDS uint64_t *), + const uint64_t iv); +void __kmpc_xteamr_d_8x32(double v, double *r_ptr, double *tvals, + uint32_t *td_ptr, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), + const double iv); +void __kmpc_xteamr_f_8x32(float v, float *r_ptr, float *tvals, uint32_t *td_ptr, + void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), + const float iv); +void __kmpc_xteamr_cd_8x32(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(_RF_LDS double _Complex *, + _RF_LDS double _Complex *), + const double _Complex iv); +void __kmpc_xteamr_cf_8x32(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(_RF_LDS float _Complex *, + _RF_LDS float _Complex *), + const float _Complex iv); +void __kmpc_xteamr_i_8x32(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), + const int iv); +void __kmpc_xteamr_ui_8x32(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void 
(*_rf_lds)(_RF_LDS uint32_t *, + _RF_LDS uint32_t *), + const uint32_t iv); +void __kmpc_xteamr_l_8x32(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), + const long iv); +void __kmpc_xteamr_ul_8x32(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(_RF_LDS uint64_t *, + _RF_LDS uint64_t *), + const uint64_t iv); + +/// Builtin pair reduction functions. +/// These become function pointers for arg5 and arg6 of xteamr above. +/// Each pair reduction function must have two variants to support xteamr. +/// The 1st is for TLS memory and the 2nd is for LDS (scratchpad) memory. +/// These are defined in Rfuns.cpp. User defined reductions require +/// that Clang codegen generate these functions. +void __kmpc_rfun_sum_d(double *val, double otherval); +void __kmpc_rfun_sum_lds_d(_RF_LDS double *val, _RF_LDS double *otherval); +void __kmpc_rfun_sum_f(float *val, float otherval); +void __kmpc_rfun_sum_lds_f(_RF_LDS float *val, _RF_LDS float *otherval); +void __kmpc_rfun_sum_cd(double _Complex *val, double _Complex otherval); +void __kmpc_rfun_sum_lds_cd(_RF_LDS double _Complex *val, + _RF_LDS double _Complex *otherval); +void __kmpc_rfun_sum_cf(float _Complex *val, float _Complex otherval); +void __kmpc_rfun_sum_lds_cf(_RF_LDS float _Complex *val, + _RF_LDS float _Complex *otherval); +void __kmpc_rfun_sum_i(int *val, int otherval); +void __kmpc_rfun_sum_lds_i(_RF_LDS int *val, _RF_LDS int *otherval); +void __kmpc_rfun_sum_ui(unsigned int *val, unsigned int otherval); +void __kmpc_rfun_sum_lds_ui(_RF_LDS unsigned int *val, + _RF_LDS unsigned int *otherval); +void __kmpc_rfun_sum_l(long *val, long otherval); +void __kmpc_rfun_sum_lds_l(_RF_LDS long *val, _RF_LDS long *otherval); +void __kmpc_rfun_sum_ul(unsigned long *val, unsigned long otherval); +void __kmpc_rfun_sum_lds_ul(_RF_LDS unsigned long *val, + _RF_LDS unsigned long 
*otherval); +/// Complex variables have no compare, so no min or max for cf and cd. +void __kmpc_rfun_min_d(double *val, double otherval); +void __kmpc_rfun_min_lds_d(_RF_LDS double *val, _RF_LDS double *otherval); +void __kmpc_rfun_min_f(float *val, float otherval); +void __kmpc_rfun_min_lds_f(_RF_LDS float *val, _RF_LDS float *otherval); +void __kmpc_rfun_min_i(int *val, int otherval); +void __kmpc_rfun_min_lds_i(_RF_LDS int *val, _RF_LDS int *otherval); +void __kmpc_rfun_min_ui(unsigned int *val, unsigned int otherval); +void __kmpc_rfun_min_lds_ui(_RF_LDS unsigned int *val, + _RF_LDS unsigned int *otherval); +void __kmpc_rfun_min_l(long *val, long otherval); +void __kmpc_rfun_min_lds_l(_RF_LDS long *val, _RF_LDS long *otherval); +void __kmpc_rfun_min_ul(unsigned long *val, unsigned long otherval); +void __kmpc_rfun_min_lds_ul(_RF_LDS unsigned long *val, + _RF_LDS unsigned long *otherval); +void __kmpc_rfun_max_d(double *val, double otherval); +void __kmpc_rfun_max_lds_d(_RF_LDS double *val, _RF_LDS double *otherval); +void __kmpc_rfun_max_f(float *val, float otherval); +void __kmpc_rfun_max_lds_f(_RF_LDS float *val, _RF_LDS float *otherval); +void __kmpc_rfun_max_i(int *val, int otherval); +void __kmpc_rfun_max_lds_i(_RF_LDS int *val, _RF_LDS int *otherval); +void __kmpc_rfun_max_ui(unsigned int *val, unsigned int otherval); +void __kmpc_rfun_max_lds_ui(_RF_LDS unsigned int *val, + _RF_LDS unsigned int *otherval); +void __kmpc_rfun_max_l(long *val, long otherval); +void __kmpc_rfun_max_lds_l(_RF_LDS long *val, _RF_LDS long *otherval); +void __kmpc_rfun_max_ul(unsigned long *val, unsigned long otherval); +void __kmpc_rfun_max_lds_ul(_RF_LDS unsigned long *val, + _RF_LDS unsigned long *otherval); +#undef _RF_LDS ///} /// Synchronization diff --git a/openmp/libomptarget/DeviceRTL/src/Rfuns.cpp b/openmp/libomptarget/DeviceRTL/src/Rfuns.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Rfuns.cpp @@ -0,0 +1,145 @@ +//===---- Rfuns.cpp - 
OpenMP reduction functions ---- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains simple reduction functions used as function pointers to +// reduction helper functions for cross team reductions defined in Xteamr.cpp +// +//===----------------------------------------------------------------------===// + +#pragma omp declare target + +#define _RF_ATTR extern "C" __attribute__((flatten, always_inline)) void +#define _RF_LDS volatile __attribute__((address_space(3))) + +_RF_ATTR __kmpc_rfun_sum_d(double *val, double otherval) { *val += otherval; } +_RF_ATTR __kmpc_rfun_sum_lds_d(_RF_LDS double *val, _RF_LDS double *otherval) { + *val += *otherval; +} +_RF_ATTR __kmpc_rfun_sum_f(float *val, float otherval) { *val += otherval; } +_RF_ATTR __kmpc_rfun_sum_lds_f(_RF_LDS float *val, _RF_LDS float *otherval) { + *val += *otherval; +} +_RF_ATTR __kmpc_rfun_sum_cd(double _Complex *val, double _Complex otherval) { + *val += otherval; +} +_RF_ATTR __kmpc_rfun_sum_lds_cd(_RF_LDS double _Complex *val, + _RF_LDS double _Complex *otherval) { + *val += *otherval; +} +_RF_ATTR __kmpc_rfun_sum_cf(float _Complex *val, float _Complex otherval) { + *val += otherval; +} +_RF_ATTR __kmpc_rfun_sum_lds_cf(_RF_LDS float _Complex *val, + _RF_LDS float _Complex *otherval) { + *val += *otherval; +} +_RF_ATTR __kmpc_rfun_sum_i(int *val, int otherval) { *val += otherval; } +_RF_ATTR __kmpc_rfun_sum_lds_i(_RF_LDS int *val, _RF_LDS int *otherval) { + *val += *otherval; +} +_RF_ATTR __kmpc_rfun_sum_ui(unsigned int *val, unsigned int otherval) { + *val += otherval; +} +_RF_ATTR __kmpc_rfun_sum_lds_ui(_RF_LDS unsigned int *val, + _RF_LDS unsigned int *otherval) { + *val += *otherval; +} +_RF_ATTR __kmpc_rfun_sum_l(long 
*val, long otherval) { *val += otherval; } +_RF_ATTR __kmpc_rfun_sum_lds_l(_RF_LDS long *val, _RF_LDS long *otherval) { + *val += *otherval; +} +_RF_ATTR __kmpc_rfun_sum_ul(unsigned long *val, unsigned long otherval) { + *val += otherval; +} +_RF_ATTR __kmpc_rfun_sum_lds_ul(_RF_LDS unsigned long *val, + _RF_LDS unsigned long *otherval) { + *val += *otherval; +} + +_RF_ATTR __kmpc_rfun_min_d(double *val, double otherval) { + *val = (otherval < *val) ? otherval : *val; +} +_RF_ATTR __kmpc_rfun_min_lds_d(_RF_LDS double *val, _RF_LDS double *otherval) { + *val = (*otherval < *val) ? *otherval : *val; +} +_RF_ATTR __kmpc_rfun_min_f(float *val, float otherval) { + *val = (otherval < *val) ? otherval : *val; +} +_RF_ATTR __kmpc_rfun_min_lds_f(_RF_LDS float *val, _RF_LDS float *otherval) { + *val = (*otherval < *val) ? *otherval : *val; +} +_RF_ATTR __kmpc_rfun_min_i(int *val, int otherval) { + *val = (otherval < *val) ? otherval : *val; +} +_RF_ATTR __kmpc_rfun_min_lds_i(_RF_LDS int *val, _RF_LDS int *otherval) { + *val = (*otherval < *val) ? *otherval : *val; +} +_RF_ATTR __kmpc_rfun_min_ui(unsigned int *val, unsigned int otherval) { + *val = (otherval < *val) ? otherval : *val; +} +_RF_ATTR __kmpc_rfun_min_lds_ui(_RF_LDS unsigned int *val, + _RF_LDS unsigned int *otherval) { + *val = (*otherval < *val) ? *otherval : *val; +} +_RF_ATTR __kmpc_rfun_min_l(long *val, long otherval) { + *val = (otherval < *val) ? otherval : *val; +} +_RF_ATTR __kmpc_rfun_min_lds_l(_RF_LDS long *val, _RF_LDS long *otherval) { + *val = (*otherval < *val) ? *otherval : *val; +} +_RF_ATTR __kmpc_rfun_min_ul(unsigned long *val, unsigned long otherval) { + *val = (otherval < *val) ? otherval : *val; +} +_RF_ATTR __kmpc_rfun_min_lds_ul(_RF_LDS unsigned long *val, + _RF_LDS unsigned long *otherval) { + *val = (*otherval < *val) ? *otherval : *val; +} + +_RF_ATTR __kmpc_rfun_max_d(double *val, double otherval) { + *val = (otherval > *val) ? 
otherval : *val; +} +_RF_ATTR __kmpc_rfun_max_lds_d(_RF_LDS double *val, _RF_LDS double *otherval) { + *val = (*otherval > *val) ? *otherval : *val; +} +_RF_ATTR __kmpc_rfun_max_f(float *val, float otherval) { + *val = (otherval > *val) ? otherval : *val; +} +_RF_ATTR __kmpc_rfun_max_lds_f(_RF_LDS float *val, _RF_LDS float *otherval) { + *val = (*otherval > *val) ? *otherval : *val; +} +_RF_ATTR __kmpc_rfun_max_i(int *val, int otherval) { + *val = (otherval > *val) ? otherval : *val; +} +_RF_ATTR __kmpc_rfun_max_lds_i(_RF_LDS int *val, _RF_LDS int *otherval) { + *val = (*otherval > *val) ? *otherval : *val; +} +_RF_ATTR __kmpc_rfun_max_ui(unsigned int *val, unsigned int otherval) { + *val = (otherval > *val) ? otherval : *val; +} +_RF_ATTR __kmpc_rfun_max_lds_ui(_RF_LDS unsigned int *val, + _RF_LDS unsigned int *otherval) { + *val = (*otherval > *val) ? *otherval : *val; +} +_RF_ATTR __kmpc_rfun_max_l(long *val, long otherval) { + *val = (otherval > *val) ? otherval : *val; +} +_RF_ATTR __kmpc_rfun_max_lds_l(_RF_LDS long *val, _RF_LDS long *otherval) { + *val = (*otherval > *val) ? *otherval : *val; +} +_RF_ATTR __kmpc_rfun_max_ul(unsigned long *val, unsigned long otherval) { + *val = (otherval > *val) ? otherval : *val; +} +_RF_ATTR __kmpc_rfun_max_lds_ul(_RF_LDS unsigned long *val, + _RF_LDS unsigned long *otherval) { + *val = (*otherval > *val) ? *otherval : *val; +} + +#undef _RF_ATTR +#undef _RF_LDS + +#pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Xteamr.cpp b/openmp/libomptarget/DeviceRTL/src/Xteamr.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Xteamr.cpp @@ -0,0 +1,704 @@ +//===---- Xteamr.cpp - OpenMP cross team helper functions ---- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains helper functions for cross team reductions +// +//===----------------------------------------------------------------------===// + +#include "Debug.h" +#include "Interface.h" +#include "Mapping.h" +#include "State.h" +#include "Synchronization.h" +#include "Types.h" +#include "Utils.h" + +#define __XTEAM_SHARED_LDS volatile __attribute__((address_space(3))) + +using namespace _OMP; + +#pragma omp declare target +// Headers for specialized shfl_xor +double xteamr_shfl_xor_d(double var, const int lane_mask, const uint32_t width); +float xteamr_shfl_xor_f(float var, const int lane_mask, const uint32_t width); +int xteamr_shfl_xor_int(int var, const int lane_mask, const uint32_t width); +double _Complex xteamr_shfl_xor_cd(double _Complex var, const int lane_mask, + const uint32_t width); +float _Complex xteamr_shfl_xor_cf(float _Complex var, const int lane_mask, + const uint32_t width); + +// Define the arch (amdgcn vs nvptx) variants of shfl + +#pragma omp begin declare variant match(device = {arch(amdgcn)}) + +int xteamr_shfl_xor_int(int var, const int lane_mask, const uint32_t width) { + int self = mapping::getThreadIdInWarp(); // __lane_id(); + int index = self ^ lane_mask; + index = index >= ((self + width) & ~(width - 1)) ? 
self : index; + return __builtin_amdgcn_ds_bpermute(index << 2, var); +} +float xteamr_shfl_xor_f(float var, const int lane_mask, const uint32_t width) { + union { + int i; + unsigned u; + float f; + } tmp; + tmp.f = var; + tmp.i = xteamr_shfl_xor_int(tmp.i, lane_mask, width); + return tmp.f; +} +double xteamr_shfl_xor_d(double var, const int lane_mask, + const uint32_t width) { + static_assert(sizeof(double) == 2 * sizeof(int), ""); + static_assert(sizeof(double) == sizeof(uint64_t), ""); + + int tmp[2]; + __builtin_memcpy(tmp, &var, sizeof(tmp)); + tmp[0] = xteamr_shfl_xor_int(tmp[0], lane_mask, width); + tmp[1] = xteamr_shfl_xor_int(tmp[1], lane_mask, width); + + uint64_t tmp0 = + (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + double tmp1; + __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); + return tmp1; +} + +double _Complex xteamr_shfl_xor_cd(double _Complex var, const int lane_mask, + const uint32_t width) { + __real__(var) = xteamr_shfl_xor_d(__real__(var), lane_mask, width); + __imag__(var) = xteamr_shfl_xor_d(__imag__(var), lane_mask, width); + return var; +} +float _Complex xteamr_shfl_xor_cf(float _Complex var, const int lane_mask, + const uint32_t width) { + __real__(var) = xteamr_shfl_xor_f(__real__(var), lane_mask, width); + __imag__(var) = xteamr_shfl_xor_f(__imag__(var), lane_mask, width); + return var; +} +#pragma omp end declare variant + +#pragma omp begin declare variant match( \ + device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)}) + +int xteamr_shfl_xor_int(int var, const int lane_mask, const uint32_t width) { + int c = ((32 - width) << 8) | 0x1f; + return __nvvm_shfl_sync_bfly_i32(0xFFFFFFFF, var, lane_mask, c); +} +float xteamr_shfl_xor_f(float var, const int lane_mask, const uint32_t width) { + union { + int i; + unsigned u; + float f; + } tmp; + tmp.f = var; + tmp.i = xteamr_shfl_xor_int(tmp.i, lane_mask, width); + return tmp.f; +} +double xteamr_shfl_xor_d(double var, int laneMask, const uint32_t width) { + 
unsigned lo, hi; + asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var)); + hi = xteamr_shfl_xor_int(hi, laneMask, width); + lo = xteamr_shfl_xor_int(lo, laneMask, width); + asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi)); + return var; +} +double _Complex xteamr_shfl_xor_cd(double _Complex var, const int lane_mask, + const uint32_t width) { + __real__(var) = xteamr_shfl_xor_d(__real__(var), lane_mask, width); + __imag__(var) = xteamr_shfl_xor_d(__imag__(var), lane_mask, width); + return var; +} +float _Complex xteamr_shfl_xor_cf(float _Complex var, const int lane_mask, + const uint32_t width) { + __real__(var) = xteamr_shfl_xor_f(__real__(var), lane_mask, width); + __imag__(var) = xteamr_shfl_xor_f(__imag__(var), lane_mask, width); + return var; +} +#pragma omp end declare variant +// } // end impl namespace + +// tag dispatching of type specific shfl_xor, get_low, and get_high +struct _d_tag {}; +struct _f_tag {}; +struct _cd_tag {}; +struct _cf_tag {}; +struct _i_tag {}; +struct _ui_tag {}; +struct _l_tag {}; +struct _ul_tag {}; +template struct __dispatch_tag; +template <> struct __dispatch_tag { + typedef _d_tag type; +}; +template <> struct __dispatch_tag { + typedef _f_tag type; +}; +template <> struct __dispatch_tag { + typedef _cd_tag type; +}; +template <> struct __dispatch_tag { + typedef _cf_tag type; +}; +template <> struct __dispatch_tag { + typedef _i_tag type; +}; +template <> struct __dispatch_tag { + typedef _ui_tag type; +}; +template <> struct __dispatch_tag { + typedef _l_tag type; +}; +template <> struct __dispatch_tag { + typedef _ul_tag type; +}; +template +double xteamr_shfl_xor(_d_tag tag, double var, const int lane_mask) { + return xteamr_shfl_xor_d(var, lane_mask, _WSZ); +} +template +float xteamr_shfl_xor(_f_tag tag, float var, const int lane_mask) { + return xteamr_shfl_xor_f(var, lane_mask, _WSZ); +} +template +double _Complex xteamr_shfl_xor(_cd_tag tag, double _Complex var, + const int lane_mask) { 
+ return xteamr_shfl_xor_cd(var, lane_mask, _WSZ); +} +template +float _Complex xteamr_shfl_xor(_cf_tag tag, float _Complex var, + const int lane_mask) { + return xteamr_shfl_xor_cf(var, lane_mask, _WSZ); +} +template +int xteamr_shfl_xor(_i_tag tag, int var, const int lane_mask) { + return xteamr_shfl_xor_int(var, lane_mask, _WSZ); +} +template +unsigned int xteamr_shfl_xor(_ui_tag tag, unsigned int var, + const int lane_mask) { + return xteamr_shfl_xor_int(var, lane_mask, _WSZ); +} +template +long xteamr_shfl_xor(_l_tag tag, long var, const int lane_mask) { + return xteamr_shfl_xor_d(var, lane_mask, _WSZ); +} +template +unsigned long xteamr_shfl_xor(_ul_tag tag, unsigned long var, + const int lane_mask) { + return xteamr_shfl_xor_d(var, lane_mask, _WSZ); +} + +template +T xteamr_shfl_xor(T var, const int lane_mask) { + typedef typename __dispatch_tag::type tag; + return xteamr_shfl_xor<_WSZ>(tag(), var, lane_mask); +} + +template +__attribute__((flatten, always_inline)) void _xteam_reduction( + T val, T *r_ptr, T *team_vals, uint32_t *teams_done_ptr, + void (*_rf)(T *, T), + void (*_rf_lds)(__XTEAM_SHARED_LDS T *, __XTEAM_SHARED_LDS T *), + const T inival) { + + constexpr int32_t _NT = _NW * _WSZ; + const uint32_t omp_thread_num = mapping::getThreadIdInBlock(); + const uint32_t omp_team_num = mapping::getBlockId(); + const uint32_t wave_num = mapping::getWarpId(); // 0 15 + const uint32_t lane_num = mapping::getThreadIdInWarp(); // 0 63 + const uint32_t NumTeams = mapping::getNumberOfBlocks(); + static __XTEAM_SHARED_LDS T xwave_lds[_NW]; + static __XTEAM_SHARED_LDS bool __is_last_team; + + // Binary reduce each wave, then copy to xwave_lds[wave_num] + for (unsigned int offset = _WSZ / 2; offset > 0; offset >>= 1) + (*_rf)(&val, xteamr_shfl_xor(val, offset)); + if (lane_num == 0) + xwave_lds[wave_num] = val; + + // Binary reduce each wave's value into wave_lds[0] with lds memory. 
+ _OMP::synchronize::threads(); + for (unsigned int offset = _NW / 2; offset > 0; offset >>= 1) { + if (omp_thread_num < offset) + (*_rf_lds)(&(xwave_lds[omp_thread_num]), + &(xwave_lds[omp_thread_num + offset])); + } + _OMP::synchronize::threads(); + + // Discover the last team to complete cross wave reduction + // The team number of last team is nondeterministic. + __is_last_team = false; + if (omp_thread_num == 0) { + team_vals[omp_team_num] = xwave_lds[0]; + uint32_t td = atomic::inc(teams_done_ptr, NumTeams - 1u, __ATOMIC_SEQ_CST); + if (td == (NumTeams - 1u)) + __is_last_team = true; + } + + // This sync needed, so all threads from last team know they are in the last + // team. + _OMP::synchronize::threads(); + + if (__is_last_team) { + // All threads from last completed team enter here. + // All other teams exit. + if (omp_thread_num < NumTeams) + val = team_vals[omp_thread_num]; + else + val = inival; + + // Reduce each wave into xwave_lds[wave_num] + for (unsigned int offset = _WSZ / 2; offset > 0; offset >>= 1) + (*_rf)(&val, xteamr_shfl_xor(val, offset)); + if (lane_num == 0) + xwave_lds[wave_num] = val; + + // For final result, inside next if clause, We know wave_lds[0] is done + // But we need a sync here to ensure wave_lds[i!=0] is correct. + _OMP::synchronize::threads(); + + // Typically only 2 useable waves when <128 CUs. No gain to parallelizing + // these last 2 reductions. So do these on thread 0 into lane 0's val. + if (omp_thread_num == 0) { + unsigned int usableWaves = ((NumTeams - 1) / _WSZ) + 1; + for (unsigned int kk = 1; kk < usableWaves; kk++) + (*_rf_lds)(&xwave_lds[0], &xwave_lds[kk]); + + // Reduce with the original result value. + xwave_lds[1] = *r_ptr; + (*_rf_lds)(&xwave_lds[0], &xwave_lds[1]); + *r_ptr = xwave_lds[0]; + } + } +} + +// Calls to these __kmpc extern C functions are created in clang codegen +// for FORTRAN, c, and C++. They may also be used for sumulation and testing. 
+// The headers for these extern C functions are in ../include/Interface.h +// The compiler builds the name based on data type, +// number of waves in the team,and warpsize. +// +#define _EXT_ATTR extern "C" __attribute__((flatten, always_inline)) void +_EXT_ATTR +__kmpc_xteamr_d_16x64(double v, double *r_ptr, double *tvals, uint32_t *td_ptr, + void (*_rf)(double *, double), + void (*_rf_lds)(__XTEAM_SHARED_LDS double *, + __XTEAM_SHARED_LDS double *), + const double iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_f_16x64(float v, float *r_ptr, float *tvals, uint32_t *td_ptr, + void (*_rf)(float *, float), + void (*_rf_lds)(__XTEAM_SHARED_LDS float *, + __XTEAM_SHARED_LDS float *), + const float iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} + +_EXT_ATTR +__kmpc_xteamr_cd_16x64(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(__XTEAM_SHARED_LDS double _Complex *, + __XTEAM_SHARED_LDS double _Complex *), + double _Complex iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, + _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_cf_16x64(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(__XTEAM_SHARED_LDS float _Complex *, + __XTEAM_SHARED_LDS float _Complex *), + float _Complex iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, + _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_i_16x64(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(__XTEAM_SHARED_LDS int *, + __XTEAM_SHARED_LDS int *), + const int iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_ui_16x64(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(__XTEAM_SHARED_LDS uint32_t *, + 
__XTEAM_SHARED_LDS uint32_t *), + const uint32_t iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_l_16x64(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(__XTEAM_SHARED_LDS long *, + __XTEAM_SHARED_LDS long *), + const long iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_ul_16x64(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(__XTEAM_SHARED_LDS uint64_t *, + __XTEAM_SHARED_LDS uint64_t *), + const uint64_t iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} + +_EXT_ATTR +__kmpc_xteamr_d_8x64(double v, double *r_ptr, double *tvals, uint32_t *td_ptr, + void (*_rf)(double *, double), + void (*_rf_lds)(__XTEAM_SHARED_LDS double *, + __XTEAM_SHARED_LDS double *), + const double iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_f_8x64(float v, float *r_ptr, float *tvals, uint32_t *td_ptr, + void (*_rf)(float *, float), + void (*_rf_lds)(__XTEAM_SHARED_LDS float *, + __XTEAM_SHARED_LDS float *), + const float iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_cd_8x64(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(__XTEAM_SHARED_LDS double _Complex *, + __XTEAM_SHARED_LDS double _Complex *), + const double _Complex iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, + _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_cf_8x64(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(__XTEAM_SHARED_LDS float _Complex *, + __XTEAM_SHARED_LDS float _Complex *), + const float _Complex iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, + 
iv); +} +_EXT_ATTR +__kmpc_xteamr_i_8x64(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(__XTEAM_SHARED_LDS int *, + __XTEAM_SHARED_LDS int *), + const int iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_ui_8x64(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(__XTEAM_SHARED_LDS uint32_t *, + __XTEAM_SHARED_LDS uint32_t *), + const uint32_t iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_l_8x64(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(__XTEAM_SHARED_LDS long *, + __XTEAM_SHARED_LDS long *), + const long iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_ul_8x64(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(__XTEAM_SHARED_LDS uint64_t *, + __XTEAM_SHARED_LDS uint64_t *), + const uint64_t iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} + +_EXT_ATTR +__kmpc_xteamr_d_4x64(double v, double *r_ptr, double *tvals, uint32_t *td_ptr, + void (*_rf)(double *, double), + void (*_rf_lds)(__XTEAM_SHARED_LDS double *, + __XTEAM_SHARED_LDS double *), + const double iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_f_4x64(float v, float *r_ptr, float *tvals, uint32_t *td_ptr, + void (*_rf)(float *, float), + void (*_rf_lds)(__XTEAM_SHARED_LDS float *, + __XTEAM_SHARED_LDS float *), + const float iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_cd_4x64(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(__XTEAM_SHARED_LDS double _Complex *, + __XTEAM_SHARED_LDS double 
_Complex *), + const double _Complex iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, + _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_cf_4x64(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(__XTEAM_SHARED_LDS float _Complex *, + __XTEAM_SHARED_LDS float _Complex *), + const float _Complex iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, + iv); +} +_EXT_ATTR +__kmpc_xteamr_i_4x64(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(__XTEAM_SHARED_LDS int *, + __XTEAM_SHARED_LDS int *), + const int iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_ui_4x64(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(__XTEAM_SHARED_LDS uint32_t *, + __XTEAM_SHARED_LDS uint32_t *), + const uint32_t iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_l_4x64(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(__XTEAM_SHARED_LDS long *, + __XTEAM_SHARED_LDS long *), + const long iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_ul_4x64(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(__XTEAM_SHARED_LDS uint64_t *, + __XTEAM_SHARED_LDS uint64_t *), + const uint64_t iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} + +_EXT_ATTR +__kmpc_xteamr_d_32x32(double v, double *r_ptr, double *tvals, uint32_t *td_ptr, + void (*_rf)(double *, double), + void (*_rf_lds)(__XTEAM_SHARED_LDS double *, + __XTEAM_SHARED_LDS double *), + const double iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_f_32x32(float v, float *r_ptr, float *tvals, uint32_t 
*td_ptr, + void (*_rf)(float *, float), + void (*_rf_lds)(__XTEAM_SHARED_LDS float *, + __XTEAM_SHARED_LDS float *), + const float iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_cd_32x32(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(__XTEAM_SHARED_LDS double _Complex *, + __XTEAM_SHARED_LDS double _Complex *), + const double _Complex iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, + _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_cf_32x32(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(__XTEAM_SHARED_LDS float _Complex *, + __XTEAM_SHARED_LDS float _Complex *), + const float _Complex iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, + _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_i_32x32(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(__XTEAM_SHARED_LDS int *, + __XTEAM_SHARED_LDS int *), + const int iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_ui_32x32(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(__XTEAM_SHARED_LDS uint32_t *, + __XTEAM_SHARED_LDS uint32_t *), + const uint32_t iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_l_32x32(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(__XTEAM_SHARED_LDS long *, + __XTEAM_SHARED_LDS long *), + const long iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_ul_32x32(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(__XTEAM_SHARED_LDS uint64_t *, + __XTEAM_SHARED_LDS uint64_t *), 
+ const uint64_t iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} + +_EXT_ATTR +__kmpc_xteamr_d_16x32(double v, double *r_ptr, double *tvals, uint32_t *td_ptr, + void (*_rf)(double *, double), + void (*_rf_lds)(__XTEAM_SHARED_LDS double *, + __XTEAM_SHARED_LDS double *), + const double iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_f_16x32(float v, float *r_ptr, float *tvals, uint32_t *td_ptr, + void (*_rf)(float *, float), + void (*_rf_lds)(__XTEAM_SHARED_LDS float *, + __XTEAM_SHARED_LDS float *), + const float iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_cd_16x32(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(__XTEAM_SHARED_LDS double _Complex *, + __XTEAM_SHARED_LDS double _Complex *), + const double _Complex iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, + _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_cf_16x32(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(__XTEAM_SHARED_LDS float _Complex *, + __XTEAM_SHARED_LDS float _Complex *), + const float _Complex iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, + _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_i_16x32(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(__XTEAM_SHARED_LDS int *, + __XTEAM_SHARED_LDS int *), + const int iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_ui_16x32(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(__XTEAM_SHARED_LDS uint32_t *, + __XTEAM_SHARED_LDS uint32_t *), + const uint32_t iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR 
+__kmpc_xteamr_l_16x32(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(__XTEAM_SHARED_LDS long *, + __XTEAM_SHARED_LDS long *), + const long iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_ul_16x32(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(__XTEAM_SHARED_LDS uint64_t *, + __XTEAM_SHARED_LDS uint64_t *), + const uint64_t iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} + +_EXT_ATTR +__kmpc_xteamr_d_8x32(double v, double *r_ptr, double *tvals, uint32_t *td_ptr, + void (*_rf)(double *, double), + void (*_rf_lds)(__XTEAM_SHARED_LDS double *, + __XTEAM_SHARED_LDS double *), + const double iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_f_8x32(float v, float *r_ptr, float *tvals, uint32_t *td_ptr, + void (*_rf)(float *, float), + void (*_rf_lds)(__XTEAM_SHARED_LDS float *, + __XTEAM_SHARED_LDS float *), + const float iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_cd_8x32(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(__XTEAM_SHARED_LDS double _Complex *, + __XTEAM_SHARED_LDS double _Complex *), + const double _Complex iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, + _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_cf_8x32(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(__XTEAM_SHARED_LDS float _Complex *, + __XTEAM_SHARED_LDS float _Complex *), + const float _Complex iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, + iv); +} +_EXT_ATTR +__kmpc_xteamr_i_8x32(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void 
(*_rf_lds)(__XTEAM_SHARED_LDS int *, + __XTEAM_SHARED_LDS int *), + const int iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_ui_8x32(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(__XTEAM_SHARED_LDS uint32_t *, + __XTEAM_SHARED_LDS uint32_t *), + const uint32_t iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_l_8x32(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(__XTEAM_SHARED_LDS long *, + __XTEAM_SHARED_LDS long *), + const long iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +_EXT_ATTR +__kmpc_xteamr_ul_8x32(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(__XTEAM_SHARED_LDS uint64_t *, + __XTEAM_SHARED_LDS uint64_t *), + const uint64_t iv) { + _xteam_reduction(v, r_ptr, tvals, td_ptr, _rf, _rf_lds, iv); +} +#undef _EXT_ATTR + +#pragma omp end declare target diff --git a/openmp/libomptarget/test/xteamr/README b/openmp/libomptarget/test/xteamr/README new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/test/xteamr/README @@ -0,0 +1,78 @@ +//===------- README for llvm-project/openmp/libomptarget/test/xteamr ------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +This xteamr test executes simulated reductions using the high performance +DeviceRTL xteam helper functions and compares to the existing OpenMP reduction. + +The goal of this smoke test is to ensure that the DeviceRTL helper functions +in Xteamr.cpp are correct and performant. 
This test (xteamr) will fail +(return a nonzero return code) if either the openmp reduction or the simulated +reduction gets the wrong answer. + +The simulated functions sim_dot, sim_max, and sim_min show what clang would +generate when it encounters an omp target pragma with one or more sum +reductions. See example below. + +The clang CodeGen changes are not yet complete. + +The interfaces to the helper functions that clang codegen is expected to generate +are contained in this test source. For building DeviceRTL the interfaces and +the documentation can be found in: + + llvm-project/openmp/libomptarget/DeviceRTL/include/Interface.h + +The definitions of all xteamr functions can be found in + + llvm-project/openmp/libomptarget/DeviceRTL/src/Xteamr.cpp + +The builtin simple pair reducing functions can be found here: + + llvm-project/openmp/libomptarget/DeviceRTL/src/Rfuns.cpp + +EXAMPLE: Given this code: + + double sum = 0.0; + #pragma omp target teams distribute parallel for map(tofrom: sum) reduction(+:sum) + for (int64_t i = 0; i < array_size; i++) + sum += a[i] * b[i]; + +Clang should generate code equivalent to this.
+ + #define _NUM_THREADS 1024 + devid = 0 ; // or default or whatever is on target construct + TEAM_PROCS = ompx_get_team_procs(devid); + team_vals0 = (double *) omp_target_alloc(sizeof(double) * TEAM_PROCS, devid); + teams_done_ptr0 = (uint32_t *) omp_target_alloc(sizeof(uint32_t),devid); + uint32_t zero = 0; + omp_target_memcpy(teams_done_ptr0, &zero , sizeof(uint32_t), 0, + 0, devid, omp_get_initial_device()); + #pragma omp target teams distribute parallel for \ + num_teams(TEAM_PROCS) num_threads(_NUM_THREADS) \ + map(tofrom:sum) is_device_ptr(team_vals0,teams_done_ptr0) + for (unsigned int k=0; k<(LOOP_TEAMS*_NUM_THREADS) ; k++) { + T val0 = 0.0; + LOOP_STRIDE=1; LOOP_START=0; LOOP_SIZE = array_size; + for (int64_t i = ((k * LOOP_STRIDE) + LOOP_START); i < LOOP_SIZE; + i += (TEAM_PROCS * _NUM_THREADS * LOOP_STRIDE)) + { + val0 += a[i] * b[i]; + } + __kmpc_xteamr_d_16x64(val0, &sum, team_vals0, teams_done_ptr0, + __kmpc_rfun_sum_d, __kmpc_rfun_sum_lds_d,0.0); + } + +If processor has 32 bit warp_size change __kmpc_xteamr_d_16x64 to __kmpc_xteamr_d_32x32 +The test script compile_run tests all configs of 1024, 512, and 256 threads +For very large arrays, the best performance is with 1024 threads. +If array_size is constant and known to be less than TEAM_PROCS*_NUM_THREADS, +the codegen may uses a smaller configuration of either 512 or 256 threads. + +To compile and execute +export LLVM_INSTALL= +export OFFLOAD_ARCH=gfx90a +./compile_run diff --git a/openmp/libomptarget/test/xteamr/header b/openmp/libomptarget/test/xteamr/header new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/test/xteamr/header @@ -0,0 +1,9 @@ +//== overload_insts_1024.h overloaded instatiations of Xteamr fcts -C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This file contains all instantiations of functions using 1024 threads +//===----------------------------------------------------------------------===// diff --git a/openmp/libomptarget/test/xteamr/overload_insts_1024.h b/openmp/libomptarget/test/xteamr/overload_insts_1024.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/test/xteamr/overload_insts_1024.h @@ -0,0 +1,279 @@ +//== overload_insts_1024.h overloaded instatiations of Xteamr fcts -C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This file contains all instantiations of functions using 1024 threads +//===----------------------------------------------------------------------===// + +// ---- Local overloads for testing sum +void __attribute__((flatten, always_inline)) +sum_overloaded_16x64(double val, double *rval, double *xteam_mem, + unsigned int *td_ptr, const double initval) { + __kmpc_xteamr_d_16x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_d, + __kmpc_rfun_sum_lds_d, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_16x64(float val, float *rval, float *xteam_mem, + unsigned int *td_ptr, const float initval) { + __kmpc_xteamr_f_16x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_f, + __kmpc_rfun_sum_lds_f, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_16x64(double _Complex val, double _Complex *rval, + double _Complex *xteam_mem, unsigned int *td_ptr, + const double _Complex initval) { + __kmpc_xteamr_cd_16x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_cd, + __kmpc_rfun_sum_lds_cd, initval); +} +void __attribute__((flatten, 
always_inline)) +sum_overloaded_16x64(float _Complex val, float _Complex *rval, + float _Complex *xteam_mem, unsigned int *td_ptr, + const float _Complex initval) { + __kmpc_xteamr_cf_16x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_cf, + __kmpc_rfun_sum_lds_cf, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_16x64(int val, int *rval, int *xteam_mem, unsigned int *td_ptr, + const int initval) { + __kmpc_xteamr_i_16x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_i, + __kmpc_rfun_sum_lds_i, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_16x64(unsigned int val, unsigned int *rval, + unsigned int *xteam_mem, unsigned int *td_ptr, + const unsigned int initval) { + __kmpc_xteamr_ui_16x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_ui, + __kmpc_rfun_sum_lds_ui, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_16x64(long val, long *rval, long *xteam_mem, + unsigned int *td_ptr, const long initval) { + __kmpc_xteamr_l_16x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_l, + __kmpc_rfun_sum_lds_l, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_16x64(unsigned long val, unsigned long *rval, + unsigned long *xteam_mem, unsigned int *td_ptr, + const unsigned long initval) { + __kmpc_xteamr_ul_16x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_ul, + __kmpc_rfun_sum_lds_ul, initval); +} + +// ---- Local overloads for testing max +void __attribute__((flatten, always_inline)) +max_overloaded_16x64(double val, double *rval, double *xteam_mem, + unsigned int *td_ptr, const double initval) { + __kmpc_xteamr_d_16x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_d, + __kmpc_rfun_max_lds_d, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_16x64(float val, float *rval, float *xteam_mem, + unsigned int *td_ptr, const float initval) { + __kmpc_xteamr_f_16x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_f, + __kmpc_rfun_max_lds_f, initval); +} +void 
__attribute__((flatten, always_inline)) +max_overloaded_16x64(int val, int *rval, int *xteam_mem, unsigned int *td_ptr, + const int initval) { + __kmpc_xteamr_i_16x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_i, + __kmpc_rfun_max_lds_i, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_16x64(unsigned int val, unsigned int *rval, + unsigned int *xteam_mem, unsigned int *td_ptr, + const unsigned int initval) { + __kmpc_xteamr_ui_16x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_ui, + __kmpc_rfun_max_lds_ui, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_16x64(long val, long *rval, long *xteam_mem, + unsigned int *td_ptr, const long initval) { + __kmpc_xteamr_l_16x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_l, + __kmpc_rfun_max_lds_l, initval); +} + +void __attribute__((flatten, always_inline)) +max_overloaded_16x64(unsigned long val, unsigned long *rval, + unsigned long *xteam_mem, unsigned int *td_ptr, + const unsigned long initval) { + __kmpc_xteamr_ul_16x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_ul, + __kmpc_rfun_max_lds_ul, initval); +} + +// ---- Local overloads for testing min +void __attribute__((flatten, always_inline)) +min_overloaded_16x64(double val, double *rval, double *xteam_mem, + unsigned int *td_ptr, const double initval) { + __kmpc_xteamr_d_16x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_d, + __kmpc_rfun_min_lds_d, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_16x64(float val, float *rval, float *xteam_mem, + unsigned int *td_ptr, const float initval) { + __kmpc_xteamr_f_16x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_f, + __kmpc_rfun_min_lds_f, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_16x64(int val, int *rval, int *xteam_mem, unsigned int *td_ptr, + const int initval) { + __kmpc_xteamr_i_16x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_i, + __kmpc_rfun_min_lds_i, initval); +} +void __attribute__((flatten, 
always_inline)) +min_overloaded_16x64(unsigned int val, unsigned int *rval, + unsigned int *xteam_mem, unsigned int *td_ptr, + const unsigned int initval) { + __kmpc_xteamr_ui_16x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_ui, + __kmpc_rfun_min_lds_ui, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_16x64(long val, long *rval, long *xteam_mem, + unsigned int *td_ptr, const long initval) { + __kmpc_xteamr_l_16x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_l, + __kmpc_rfun_min_lds_l, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_16x64(unsigned long val, unsigned long *rval, + unsigned long *xteam_mem, unsigned int *td_ptr, + const unsigned long initval) { + __kmpc_xteamr_ul_16x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_ul, + __kmpc_rfun_min_lds_ul, initval); +} + +// ---- Local overloads for testing sum with 32 item warps +void __attribute__((flatten, always_inline)) +sum_overloaded_32x32(double val, double *rval, double *xteam_mem, + unsigned int *td_ptr, const double initval) { + __kmpc_xteamr_d_32x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_d, + __kmpc_rfun_sum_lds_d, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_32x32(float val, float *rval, float *xteam_mem, + unsigned int *td_ptr, const float initval) { + __kmpc_xteamr_f_32x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_f, + __kmpc_rfun_sum_lds_f, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_32x32(double _Complex val, double _Complex *rval, + double _Complex *xteam_mem, unsigned int *td_ptr, + const double _Complex initval) { + __kmpc_xteamr_cd_32x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_cd, + __kmpc_rfun_sum_lds_cd, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_32x32(float _Complex val, float _Complex *rval, + float _Complex *xteam_mem, unsigned int *td_ptr, + const float _Complex initval) { + __kmpc_xteamr_cf_32x32(val, rval, xteam_mem, 
td_ptr, __kmpc_rfun_sum_cf, + __kmpc_rfun_sum_lds_cf, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_32x32(int val, int *rval, int *xteam_mem, unsigned int *td_ptr, + const int initval) { + __kmpc_xteamr_i_32x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_i, + __kmpc_rfun_sum_lds_i, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_32x32(unsigned int val, unsigned int *rval, + unsigned int *xteam_mem, unsigned int *td_ptr, + const unsigned int initval) { + __kmpc_xteamr_ui_32x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_ui, + __kmpc_rfun_sum_lds_ui, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_32x32(long val, long *rval, long *xteam_mem, + unsigned int *td_ptr, const long initval) { + __kmpc_xteamr_l_32x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_l, + __kmpc_rfun_sum_lds_l, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_32x32(unsigned long val, unsigned long *rval, + unsigned long *xteam_mem, unsigned int *td_ptr, + const unsigned long initval) { + __kmpc_xteamr_ul_32x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_ul, + __kmpc_rfun_sum_lds_ul, initval); +} + +// ---- Local overloads for testing max +void __attribute__((flatten, always_inline)) +max_overloaded_32x32(double val, double *rval, double *xteam_mem, + unsigned int *td_ptr, const double initval) { + __kmpc_xteamr_d_32x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_d, + __kmpc_rfun_max_lds_d, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_32x32(float val, float *rval, float *xteam_mem, + unsigned int *td_ptr, const float initval) { + __kmpc_xteamr_f_32x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_f, + __kmpc_rfun_max_lds_f, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_32x32(int val, int *rval, int *xteam_mem, unsigned int *td_ptr, + const int initval) { + __kmpc_xteamr_i_32x32(val, rval, xteam_mem, td_ptr, 
__kmpc_rfun_max_i, + __kmpc_rfun_max_lds_i, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_32x32(unsigned int val, unsigned int *rval, + unsigned int *xteam_mem, unsigned int *td_ptr, + const unsigned int initval) { + __kmpc_xteamr_ui_32x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_ui, + __kmpc_rfun_max_lds_ui, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_32x32(long val, long *rval, long *xteam_mem, + unsigned int *td_ptr, const long initval) { + __kmpc_xteamr_l_32x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_l, + __kmpc_rfun_max_lds_l, initval); +} + +void __attribute__((flatten, always_inline)) +max_overloaded_32x32(unsigned long val, unsigned long *rval, + unsigned long *xteam_mem, unsigned int *td_ptr, + const unsigned long initval) { + __kmpc_xteamr_ul_32x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_ul, + __kmpc_rfun_max_lds_ul, initval); +} + +// ---- Local overloads for testing min +void __attribute__((flatten, always_inline)) +min_overloaded_32x32(double val, double *rval, double *xteam_mem, + unsigned int *td_ptr, const double initval) { + __kmpc_xteamr_d_32x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_d, + __kmpc_rfun_min_lds_d, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_32x32(float val, float *rval, float *xteam_mem, + unsigned int *td_ptr, const float initval) { + __kmpc_xteamr_f_32x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_f, + __kmpc_rfun_min_lds_f, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_32x32(int val, int *rval, int *xteam_mem, unsigned int *td_ptr, + const int initval) { + __kmpc_xteamr_i_32x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_i, + __kmpc_rfun_min_lds_i, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_32x32(unsigned int val, unsigned int *rval, + unsigned int *xteam_mem, unsigned int *td_ptr, + const unsigned int initval) { + __kmpc_xteamr_ui_32x32(val, rval, 
xteam_mem, td_ptr, __kmpc_rfun_min_ui, + __kmpc_rfun_min_lds_ui, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_32x32(long val, long *rval, long *xteam_mem, + unsigned int *td_ptr, const long initval) { + __kmpc_xteamr_l_32x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_l, + __kmpc_rfun_min_lds_l, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_32x32(unsigned long val, unsigned long *rval, + unsigned long *xteam_mem, unsigned int *td_ptr, + const unsigned long initval) { + __kmpc_xteamr_ul_32x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_ul, + __kmpc_rfun_min_lds_ul, initval); +} diff --git a/openmp/libomptarget/test/xteamr/overload_insts_256.h b/openmp/libomptarget/test/xteamr/overload_insts_256.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/test/xteamr/overload_insts_256.h @@ -0,0 +1,279 @@ +//=== overload_insts_256.h overloaded instatiations of Xteamr fcts -C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This file contains all instantiations of functions using 256 threads +//===----------------------------------------------------------------------===// + +// ---- Local overloads for testing sum +void __attribute__((flatten, always_inline)) +sum_overloaded_4x64(double val, double *rval, double *xteam_mem, + unsigned int *td_ptr, const double initval) { + __kmpc_xteamr_d_4x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_d, + __kmpc_rfun_sum_lds_d, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_4x64(float val, float *rval, float *xteam_mem, + unsigned int *td_ptr, const float initval) { + __kmpc_xteamr_f_4x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_f, + __kmpc_rfun_sum_lds_f, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_4x64(double _Complex val, double _Complex *rval, + double _Complex *xteam_mem, unsigned int *td_ptr, + const double _Complex initval) { + __kmpc_xteamr_cd_4x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_cd, + __kmpc_rfun_sum_lds_cd, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_4x64(float _Complex val, float _Complex *rval, + float _Complex *xteam_mem, unsigned int *td_ptr, + const float _Complex initval) { + __kmpc_xteamr_cf_4x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_cf, + __kmpc_rfun_sum_lds_cf, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_4x64(int val, int *rval, int *xteam_mem, unsigned int *td_ptr, + const int initval) { + __kmpc_xteamr_i_4x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_i, + __kmpc_rfun_sum_lds_i, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_4x64(unsigned int val, unsigned int *rval, + unsigned int *xteam_mem, unsigned int *td_ptr, + const unsigned int initval) { + __kmpc_xteamr_ui_4x64(val, rval, xteam_mem, td_ptr, 
__kmpc_rfun_sum_ui, + __kmpc_rfun_sum_lds_ui, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_4x64(long val, long *rval, long *xteam_mem, unsigned int *td_ptr, + const long initval) { + __kmpc_xteamr_l_4x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_l, + __kmpc_rfun_sum_lds_l, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_4x64(unsigned long val, unsigned long *rval, + unsigned long *xteam_mem, unsigned int *td_ptr, + const unsigned long initval) { + __kmpc_xteamr_ul_4x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_ul, + __kmpc_rfun_sum_lds_ul, initval); +} + +// ---- Local overloads for testing max +void __attribute__((flatten, always_inline)) +max_overloaded_4x64(double val, double *rval, double *xteam_mem, + unsigned int *td_ptr, const double initval) { + __kmpc_xteamr_d_4x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_d, + __kmpc_rfun_max_lds_d, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_4x64(float val, float *rval, float *xteam_mem, + unsigned int *td_ptr, const float initval) { + __kmpc_xteamr_f_4x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_f, + __kmpc_rfun_max_lds_f, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_4x64(int val, int *rval, int *xteam_mem, unsigned int *td_ptr, + const int initval) { + __kmpc_xteamr_i_4x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_i, + __kmpc_rfun_max_lds_i, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_4x64(unsigned int val, unsigned int *rval, + unsigned int *xteam_mem, unsigned int *td_ptr, + const unsigned int initval) { + __kmpc_xteamr_ui_4x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_ui, + __kmpc_rfun_max_lds_ui, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_4x64(long val, long *rval, long *xteam_mem, unsigned int *td_ptr, + const long initval) { + __kmpc_xteamr_l_4x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_l, + 
__kmpc_rfun_max_lds_l, initval); +} + +void __attribute__((flatten, always_inline)) +max_overloaded_4x64(unsigned long val, unsigned long *rval, + unsigned long *xteam_mem, unsigned int *td_ptr, + const unsigned long initval) { + __kmpc_xteamr_ul_4x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_ul, + __kmpc_rfun_max_lds_ul, initval); +} + +// ---- Local overloads for testing min +void __attribute__((flatten, always_inline)) +min_overloaded_4x64(double val, double *rval, double *xteam_mem, + unsigned int *td_ptr, const double initval) { + __kmpc_xteamr_d_4x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_d, + __kmpc_rfun_min_lds_d, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_4x64(float val, float *rval, float *xteam_mem, + unsigned int *td_ptr, const float initval) { + __kmpc_xteamr_f_4x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_f, + __kmpc_rfun_min_lds_f, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_4x64(int val, int *rval, int *xteam_mem, unsigned int *td_ptr, + const int initval) { + __kmpc_xteamr_i_4x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_i, + __kmpc_rfun_min_lds_i, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_4x64(unsigned int val, unsigned int *rval, + unsigned int *xteam_mem, unsigned int *td_ptr, + const unsigned int initval) { + __kmpc_xteamr_ui_4x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_ui, + __kmpc_rfun_min_lds_ui, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_4x64(long val, long *rval, long *xteam_mem, unsigned int *td_ptr, + const long initval) { + __kmpc_xteamr_l_4x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_l, + __kmpc_rfun_min_lds_l, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_4x64(unsigned long val, unsigned long *rval, + unsigned long *xteam_mem, unsigned int *td_ptr, + const unsigned long initval) { + __kmpc_xteamr_ul_4x64(val, rval, xteam_mem, td_ptr, 
__kmpc_rfun_min_ul, + __kmpc_rfun_min_lds_ul, initval); +} + +// ---- Local overloads for testing sum with 32 item warps +void __attribute__((flatten, always_inline)) +sum_overloaded_8x32(double val, double *rval, double *xteam_mem, + unsigned int *td_ptr, const double initval) { + __kmpc_xteamr_d_8x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_d, + __kmpc_rfun_sum_lds_d, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_8x32(float val, float *rval, float *xteam_mem, + unsigned int *td_ptr, const float initval) { + __kmpc_xteamr_f_8x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_f, + __kmpc_rfun_sum_lds_f, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_8x32(double _Complex val, double _Complex *rval, + double _Complex *xteam_mem, unsigned int *td_ptr, + const double _Complex initval) { + __kmpc_xteamr_cd_8x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_cd, + __kmpc_rfun_sum_lds_cd, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_8x32(float _Complex val, float _Complex *rval, + float _Complex *xteam_mem, unsigned int *td_ptr, + const float _Complex initval) { + __kmpc_xteamr_cf_8x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_cf, + __kmpc_rfun_sum_lds_cf, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_8x32(int val, int *rval, int *xteam_mem, unsigned int *td_ptr, + const int initval) { + __kmpc_xteamr_i_8x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_i, + __kmpc_rfun_sum_lds_i, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_8x32(unsigned int val, unsigned int *rval, + unsigned int *xteam_mem, unsigned int *td_ptr, + const unsigned int initval) { + __kmpc_xteamr_ui_8x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_ui, + __kmpc_rfun_sum_lds_ui, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_8x32(long val, long *rval, long *xteam_mem, unsigned int *td_ptr, + const long initval) { + 
__kmpc_xteamr_l_8x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_l, + __kmpc_rfun_sum_lds_l, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_8x32(unsigned long val, unsigned long *rval, + unsigned long *xteam_mem, unsigned int *td_ptr, + const unsigned long initval) { + __kmpc_xteamr_ul_8x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_ul, + __kmpc_rfun_sum_lds_ul, initval); +} + +// ---- Local overloads for testing max +void __attribute__((flatten, always_inline)) +max_overloaded_8x32(double val, double *rval, double *xteam_mem, + unsigned int *td_ptr, const double initval) { + __kmpc_xteamr_d_8x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_d, + __kmpc_rfun_max_lds_d, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_8x32(float val, float *rval, float *xteam_mem, + unsigned int *td_ptr, const float initval) { + __kmpc_xteamr_f_8x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_f, + __kmpc_rfun_max_lds_f, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_8x32(int val, int *rval, int *xteam_mem, unsigned int *td_ptr, + const int initval) { + __kmpc_xteamr_i_8x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_i, + __kmpc_rfun_max_lds_i, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_8x32(unsigned int val, unsigned int *rval, + unsigned int *xteam_mem, unsigned int *td_ptr, + const unsigned int initval) { + __kmpc_xteamr_ui_8x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_ui, + __kmpc_rfun_max_lds_ui, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_8x32(long val, long *rval, long *xteam_mem, unsigned int *td_ptr, + const long initval) { + __kmpc_xteamr_l_8x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_l, + __kmpc_rfun_max_lds_l, initval); +} + +void __attribute__((flatten, always_inline)) +max_overloaded_8x32(unsigned long val, unsigned long *rval, + unsigned long *xteam_mem, unsigned int *td_ptr, + const unsigned long initval) 
{ + __kmpc_xteamr_ul_8x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_ul, + __kmpc_rfun_max_lds_ul, initval); +} + +// ---- Local overloads for testing min +void __attribute__((flatten, always_inline)) +min_overloaded_8x32(double val, double *rval, double *xteam_mem, + unsigned int *td_ptr, const double initval) { + __kmpc_xteamr_d_8x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_d, + __kmpc_rfun_min_lds_d, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_8x32(float val, float *rval, float *xteam_mem, + unsigned int *td_ptr, const float initval) { + __kmpc_xteamr_f_8x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_f, + __kmpc_rfun_min_lds_f, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_8x32(int val, int *rval, int *xteam_mem, unsigned int *td_ptr, + const int initval) { + __kmpc_xteamr_i_8x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_i, + __kmpc_rfun_min_lds_i, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_8x32(unsigned int val, unsigned int *rval, + unsigned int *xteam_mem, unsigned int *td_ptr, + const unsigned int initval) { + __kmpc_xteamr_ui_8x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_ui, + __kmpc_rfun_min_lds_ui, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_8x32(long val, long *rval, long *xteam_mem, unsigned int *td_ptr, + const long initval) { + __kmpc_xteamr_l_8x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_l, + __kmpc_rfun_min_lds_l, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_8x32(unsigned long val, unsigned long *rval, + unsigned long *xteam_mem, unsigned int *td_ptr, + const unsigned long initval) { + __kmpc_xteamr_ul_8x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_ul, + __kmpc_rfun_min_lds_ul, initval); +} diff --git a/openmp/libomptarget/test/xteamr/overload_insts_512.h b/openmp/libomptarget/test/xteamr/overload_insts_512.h new file mode 100644 --- /dev/null +++ 
b/openmp/libomptarget/test/xteamr/overload_insts_512.h @@ -0,0 +1,278 @@ +//=== overload_insts_512.h overloaded instantiations of Xteamr fcts -C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This file contains all instantiations of functions using 512 threads +//===----------------------------------------------------------------------===// +// ---- Local overloads for testing sum +void __attribute__((flatten, always_inline)) +sum_overloaded_8x64(double val, double *rval, double *xteam_mem, + unsigned int *td_ptr, const double initval) { + __kmpc_xteamr_d_8x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_d, + __kmpc_rfun_sum_lds_d, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_8x64(float val, float *rval, float *xteam_mem, + unsigned int *td_ptr, const float initval) { + __kmpc_xteamr_f_8x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_f, + __kmpc_rfun_sum_lds_f, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_8x64(double _Complex val, double _Complex *rval, + double _Complex *xteam_mem, unsigned int *td_ptr, + const double _Complex initval) { + __kmpc_xteamr_cd_8x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_cd, + __kmpc_rfun_sum_lds_cd, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_8x64(float _Complex val, float _Complex *rval, + float _Complex *xteam_mem, unsigned int *td_ptr, + const float _Complex initval) { + __kmpc_xteamr_cf_8x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_cf, + __kmpc_rfun_sum_lds_cf, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_8x64(int val, int *rval, int *xteam_mem, unsigned int *td_ptr, + const int initval) { + __kmpc_xteamr_i_8x64(val, rval, xteam_mem, td_ptr, 
__kmpc_rfun_sum_i, + __kmpc_rfun_sum_lds_i, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_8x64(unsigned int val, unsigned int *rval, + unsigned int *xteam_mem, unsigned int *td_ptr, + const unsigned int initval) { + __kmpc_xteamr_ui_8x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_ui, + __kmpc_rfun_sum_lds_ui, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_8x64(long val, long *rval, long *xteam_mem, unsigned int *td_ptr, + const long initval) { + __kmpc_xteamr_l_8x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_l, + __kmpc_rfun_sum_lds_l, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_8x64(unsigned long val, unsigned long *rval, + unsigned long *xteam_mem, unsigned int *td_ptr, + const unsigned long initval) { + __kmpc_xteamr_ul_8x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_ul, + __kmpc_rfun_sum_lds_ul, initval); +} + +// ---- Local overloads for testing max +void __attribute__((flatten, always_inline)) +max_overloaded_8x64(double val, double *rval, double *xteam_mem, + unsigned int *td_ptr, const double initval) { + __kmpc_xteamr_d_8x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_d, + __kmpc_rfun_max_lds_d, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_8x64(float val, float *rval, float *xteam_mem, + unsigned int *td_ptr, const float initval) { + __kmpc_xteamr_f_8x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_f, + __kmpc_rfun_max_lds_f, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_8x64(int val, int *rval, int *xteam_mem, unsigned int *td_ptr, + const int initval) { + __kmpc_xteamr_i_8x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_i, + __kmpc_rfun_max_lds_i, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_8x64(unsigned int val, unsigned int *rval, + unsigned int *xteam_mem, unsigned int *td_ptr, + const unsigned int initval) { + __kmpc_xteamr_ui_8x64(val, rval, xteam_mem, td_ptr, 
__kmpc_rfun_max_ui, + __kmpc_rfun_max_lds_ui, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_8x64(long val, long *rval, long *xteam_mem, unsigned int *td_ptr, + const long initval) { + __kmpc_xteamr_l_8x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_l, + __kmpc_rfun_max_lds_l, initval); +} + +void __attribute__((flatten, always_inline)) +max_overloaded_8x64(unsigned long val, unsigned long *rval, + unsigned long *xteam_mem, unsigned int *td_ptr, + const unsigned long initval) { + __kmpc_xteamr_ul_8x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_ul, + __kmpc_rfun_max_lds_ul, initval); +} + +// ---- Local overloads for testing min +void __attribute__((flatten, always_inline)) +min_overloaded_8x64(double val, double *rval, double *xteam_mem, + unsigned int *td_ptr, const double initval) { + __kmpc_xteamr_d_8x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_d, + __kmpc_rfun_min_lds_d, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_8x64(float val, float *rval, float *xteam_mem, + unsigned int *td_ptr, const float initval) { + __kmpc_xteamr_f_8x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_f, + __kmpc_rfun_min_lds_f, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_8x64(int val, int *rval, int *xteam_mem, unsigned int *td_ptr, + const int initval) { + __kmpc_xteamr_i_8x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_i, + __kmpc_rfun_min_lds_i, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_8x64(unsigned int val, unsigned int *rval, + unsigned int *xteam_mem, unsigned int *td_ptr, + const unsigned int initval) { + __kmpc_xteamr_ui_8x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_ui, + __kmpc_rfun_min_lds_ui, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_8x64(long val, long *rval, long *xteam_mem, unsigned int *td_ptr, + const long initval) { + __kmpc_xteamr_l_8x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_l, + 
__kmpc_rfun_min_lds_l, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_8x64(unsigned long val, unsigned long *rval, + unsigned long *xteam_mem, unsigned int *td_ptr, + const unsigned long initval) { + __kmpc_xteamr_ul_8x64(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_ul, + __kmpc_rfun_min_lds_ul, initval); +} + +// ---- Local overloads for testing sum with 32 item warps +void __attribute__((flatten, always_inline)) +sum_overloaded_16x32(double val, double *rval, double *xteam_mem, + unsigned int *td_ptr, const double initval) { + __kmpc_xteamr_d_16x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_d, + __kmpc_rfun_sum_lds_d, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_16x32(float val, float *rval, float *xteam_mem, + unsigned int *td_ptr, const float initval) { + __kmpc_xteamr_f_16x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_f, + __kmpc_rfun_sum_lds_f, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_16x32(double _Complex val, double _Complex *rval, + double _Complex *xteam_mem, unsigned int *td_ptr, + const double _Complex initval) { + __kmpc_xteamr_cd_16x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_cd, + __kmpc_rfun_sum_lds_cd, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_16x32(float _Complex val, float _Complex *rval, + float _Complex *xteam_mem, unsigned int *td_ptr, + const float _Complex initval) { + __kmpc_xteamr_cf_16x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_cf, + __kmpc_rfun_sum_lds_cf, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_16x32(int val, int *rval, int *xteam_mem, unsigned int *td_ptr, + const int initval) { + __kmpc_xteamr_i_16x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_i, + __kmpc_rfun_sum_lds_i, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_16x32(unsigned int val, unsigned int *rval, + unsigned int *xteam_mem, unsigned int *td_ptr, + const unsigned int 
initval) { + __kmpc_xteamr_ui_16x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_ui, + __kmpc_rfun_sum_lds_ui, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_16x32(long val, long *rval, long *xteam_mem, + unsigned int *td_ptr, const long initval) { + __kmpc_xteamr_l_16x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_l, + __kmpc_rfun_sum_lds_l, initval); +} +void __attribute__((flatten, always_inline)) +sum_overloaded_16x32(unsigned long val, unsigned long *rval, + unsigned long *xteam_mem, unsigned int *td_ptr, + const unsigned long initval) { + __kmpc_xteamr_ul_16x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_sum_ul, + __kmpc_rfun_sum_lds_ul, initval); +} + +// ---- Local overloads for testing max +void __attribute__((flatten, always_inline)) +max_overloaded_16x32(double val, double *rval, double *xteam_mem, + unsigned int *td_ptr, const double initval) { + __kmpc_xteamr_d_16x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_d, + __kmpc_rfun_max_lds_d, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_16x32(float val, float *rval, float *xteam_mem, + unsigned int *td_ptr, const float initval) { + __kmpc_xteamr_f_16x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_f, + __kmpc_rfun_max_lds_f, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_16x32(int val, int *rval, int *xteam_mem, unsigned int *td_ptr, + const int initval) { + __kmpc_xteamr_i_16x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_i, + __kmpc_rfun_max_lds_i, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_16x32(unsigned int val, unsigned int *rval, + unsigned int *xteam_mem, unsigned int *td_ptr, + const unsigned int initval) { + __kmpc_xteamr_ui_16x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_ui, + __kmpc_rfun_max_lds_ui, initval); +} +void __attribute__((flatten, always_inline)) +max_overloaded_16x32(long val, long *rval, long *xteam_mem, + unsigned int *td_ptr, const long initval) { + 
__kmpc_xteamr_l_16x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_l, + __kmpc_rfun_max_lds_l, initval); +} + +void __attribute__((flatten, always_inline)) +max_overloaded_16x32(unsigned long val, unsigned long *rval, + unsigned long *xteam_mem, unsigned int *td_ptr, + const unsigned long initval) { + __kmpc_xteamr_ul_16x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_max_ul, + __kmpc_rfun_max_lds_ul, initval); +} + +// ---- Local overloads for testing min +void __attribute__((flatten, always_inline)) +min_overloaded_16x32(double val, double *rval, double *xteam_mem, + unsigned int *td_ptr, const double initval) { + __kmpc_xteamr_d_16x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_d, + __kmpc_rfun_min_lds_d, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_16x32(float val, float *rval, float *xteam_mem, + unsigned int *td_ptr, const float initval) { + __kmpc_xteamr_f_16x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_f, + __kmpc_rfun_min_lds_f, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_16x32(int val, int *rval, int *xteam_mem, unsigned int *td_ptr, + const int initval) { + __kmpc_xteamr_i_16x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_i, + __kmpc_rfun_min_lds_i, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_16x32(unsigned int val, unsigned int *rval, + unsigned int *xteam_mem, unsigned int *td_ptr, + const unsigned int initval) { + __kmpc_xteamr_ui_16x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_ui, + __kmpc_rfun_min_lds_ui, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_16x32(long val, long *rval, long *xteam_mem, + unsigned int *td_ptr, const long initval) { + __kmpc_xteamr_l_16x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_l, + __kmpc_rfun_min_lds_l, initval); +} +void __attribute__((flatten, always_inline)) +min_overloaded_16x32(unsigned long val, unsigned long *rval, + unsigned long *xteam_mem, unsigned int *td_ptr, + const unsigned 
long initval) { + __kmpc_xteamr_ul_16x32(val, rval, xteam_mem, td_ptr, __kmpc_rfun_min_ul, + __kmpc_rfun_min_lds_ul, initval); +} diff --git a/openmp/libomptarget/test/xteamr/test_xteamr.sh b/openmp/libomptarget/test/xteamr/test_xteamr.sh new file mode 100755 --- /dev/null +++ b/openmp/libomptarget/test/xteamr/test_xteamr.sh @@ -0,0 +1,63 @@ +#!/bin/bash +#== test_xteamr.sh test script for Xteamr reduction helper fcts -----------===// +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +#===----------------------------------------------------------------------===// +# +# test_xteamr.sh: Script to test high performance reduction helper functions +# in llvm-project/openmp/libomptarget/DeviceRTL/src/Xteamr.cpp +# It compiles and executes xteamr.cpp in 3 different configs. +# +# See README file in this directory for more information. +# +#===----------------------------------------------------------------------===// + +LLVM_INSTALL=${LLVM_INSTALL:-$HOME/llvm} +[ ! -f $LLVM_INSTALL/bin/clang ] && echo "ERROR: no LLVM install at $LLVM_INSTALL" && exit 1 + +OFFLOAD_ARCH=${OFFLOAD_ARCH:-sm_70} + +tmpdir=/tmp/$USER/xteamr && mkdir -p $tmpdir +[ ! -d $tmpdir ] && echo "ERROR: could not create $tmpdir" + +ARRAY_SIZE=${ARRAY_SIZE:-41943040} +as_arg="-D_ARRAY_SIZE=$ARRAY_SIZE" + +cuda_args="" +CUDA_INSTALL=${CUDA_INSTALL:-/usr/local/cuda} +[ -d $CUDA_INSTALL ] && cudalib=$CUDA_INSTALL/targets/x86_64-linux/lib && export LD_LIBRARY_PATH=$cudalib && cuda_args="-L$cudalib -lcudart" + +nt_arg="-D_XTEAM_NUM_THREADS=1024" +echo compile with $as_arg and $nt_arg +$LLVM_INSTALL/bin/clang++ -O3 -I. $as_arg $nt_arg -fopenmp --offload-arch=$OFFLOAD_ARCH xteamr.cpp -o $tmpdir/xteamr_1024 $cuda_args -lstdc++ -latomic +rc1=$? + +nt_arg="-D_XTEAM_NUM_THREADS=512" +echo compile with $as_arg and $nt_arg +$LLVM_INSTALL/bin/clang++ -O3 -I. 
$as_arg $nt_arg -fopenmp --offload-arch=$OFFLOAD_ARCH xteamr.cpp -o $tmpdir/xteamr_512 $cuda_args -lstdc++ -latomic +rc2=$? + +nt_arg="-D_XTEAM_NUM_THREADS=256" +echo compile with $as_arg and $nt_arg +$LLVM_INSTALL/bin/clang++ -O3 -I. $as_arg $nt_arg -fopenmp --offload-arch=$OFFLOAD_ARCH xteamr.cpp -o $tmpdir/xteamr_256 $cuda_args -lstdc++ -latomic +rc3=$? + +[ $rc1 == 0 ] && echo "START EXECUTE xteamr_1024" && $tmpdir/xteamr_1024 > $tmpdir/xteamr_1024.out +rc4=$? +[ $rc2 == 0 ] && echo "START EXECUTE xteamr_512" && $tmpdir/xteamr_512 > $tmpdir/xteamr_512.out +rc5=$? +[ $rc3 == 0 ] && echo "START EXECUTE xteamr_256" && $tmpdir/xteamr_256 > $tmpdir/xteamr_256.out +rc6=$? + +echo +rc=$(( $rc1 + $rc2 + $rc3 + $rc4 + $rc5 + $rc6 )) +if [ $rc != 0 ] ; then + echo "ERRORS DETECTED!" +else + echo "No errors detected" +fi +echo "Logs and binaries saved to $tmpdir" +exit $rc diff --git a/openmp/libomptarget/test/xteamr/xteamr.cpp b/openmp/libomptarget/test/xteamr/xteamr.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/test/xteamr/xteamr.cpp @@ -0,0 +1,680 @@ +//===--------- xteamr.cpp - Test for Xteamr DeviceRTL functions ----C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// performance and functional tests for Xteamr reduction helper functions in +// libomptarget/DeviceRTL/Xteamr.cpp +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Extern Xteamr functions are designed for 1024, 512, and 256 thread blocks. +// To change _XTEAM_NUM_THREADS to 512, change "16x64" names to "8x64" names, +// and change "32x32" names to "16x32" names. 
+// For 256 threads, change "16x64" names to "4x64", and "32x32" to "8x32". + +#ifndef _XTEAM_NUM_THREADS +#define _XTEAM_NUM_THREADS 1024 +#endif + +#if _XTEAM_NUM_THREADS == 1024 +#define _SUM_OVERLOAD_64_FCT sum_overloaded_16x64 +#define _SUM_OVERLOAD_32_FCT sum_overloaded_32x32 +#define _MAX_OVERLOAD_64_FCT max_overloaded_16x64 +#define _MAX_OVERLOAD_32_FCT max_overloaded_32x32 +#define _MIN_OVERLOAD_64_FCT min_overloaded_16x64 +#define _MIN_OVERLOAD_32_FCT min_overloaded_32x32 +#define _OVERLOADED_INSTANTIATIONS "overload_insts_1024.h" +#elif _XTEAM_NUM_THREADS == 512 +#define _SUM_OVERLOAD_64_FCT sum_overloaded_8x64 +#define _SUM_OVERLOAD_32_FCT sum_overloaded_16x32 +#define _MAX_OVERLOAD_64_FCT max_overloaded_8x64 +#define _MAX_OVERLOAD_32_FCT max_overloaded_16x32 +#define _MIN_OVERLOAD_64_FCT min_overloaded_8x64 +#define _MIN_OVERLOAD_32_FCT min_overloaded_16x32 +#define _OVERLOADED_INSTANTIATIONS "overload_insts_512.h" +#elif _XTEAM_NUM_THREADS == 256 +#define _SUM_OVERLOAD_64_FCT sum_overloaded_4x64 +#define _SUM_OVERLOAD_32_FCT sum_overloaded_8x32 +#define _MAX_OVERLOAD_64_FCT max_overloaded_4x64 +#define _MAX_OVERLOAD_32_FCT max_overloaded_8x32 +#define _MIN_OVERLOAD_64_FCT min_overloaded_4x64 +#define _MIN_OVERLOAD_32_FCT min_overloaded_8x32 +#define _OVERLOADED_INSTANTIATIONS "overload_insts_256.h" +#else +#error Invalid value for _XTEAM_NUM_THREADS. 
Must be 1024, 512, or 256 +#endif + +#ifndef _ARRAY_SIZE +#define _ARRAY_SIZE 335544320 +#endif +const uint64_t ARRAY_SIZE = _ARRAY_SIZE; +unsigned int repeat_num_times = 12; +unsigned int ignore_times = 2; // ignore this many timings first + +unsigned int test_run_rc = 0; + +template void run_tests(const uint64_t); +template void run_tests_complex(const uint64_t); + +int main(int argc, char *argv[]) { + std::cout << std::endl + << "TEST DOUBLE " << _XTEAM_NUM_THREADS << " THREADS" << std::endl; + run_tests(ARRAY_SIZE); + std::cout << std::endl + << "TEST FLOAT " << _XTEAM_NUM_THREADS << " THREADS" << std::endl; + run_tests(ARRAY_SIZE); + std::cout << std::endl + << "TEST INT " << _XTEAM_NUM_THREADS << " THREADS" << std::endl; + run_tests(ARRAY_SIZE); + std::cout << std::endl + << "TEST UNSIGNED INT " << _XTEAM_NUM_THREADS << " THREADS" + << std::endl; + run_tests(ARRAY_SIZE); + std::cout << std::endl + << "TEST LONG " << _XTEAM_NUM_THREADS << " THREADS " << std::endl; + run_tests(ARRAY_SIZE); + std::cout << std::endl + << "TEST UNSIGNED LONG " << _XTEAM_NUM_THREADS << " THREADS" + << std::endl; + run_tests(ARRAY_SIZE); +#if 0 + // FIXME: _Complex is hanging in last team logic of _xteam_reduction + // Hint, sizeof(double _Complex) is 16 bytes + std::cout << std::endl << "TEST FLOAT COMPLEX " + << _XTEAM_NUM_THREADS << " THREADS" << std::endl; + run_tests_complex(ARRAY_SIZE); + std::cout << std::endl << "TEST DOUBLE COMPLEX " + << _XTEAM_NUM_THREADS << " THREADS" << std::endl; + run_tests_complex(ARRAY_SIZE); +#endif + return test_run_rc; +} + +#include "xteamr_host_dev.h" +#include _OVERLOADED_INSTANTIATIONS + +template T omp_dot(T *a, T *b, uint64_t array_size) { + T sum = 0.0; +#pragma omp target teams distribute parallel for map(tofrom: sum) reduction(+:sum) + for (int64_t i = 0; i < array_size; i++) + sum += a[i] * b[i]; + return sum; +} + +template T omp_max(T *c, uint64_t array_size) { + T maxval = std::numeric_limits::lowest(); +#pragma omp target teams 
distribute parallel for map(tofrom \ + : maxval) \ + reduction(max \ + : maxval) + for (int64_t i = 0; i < array_size; i++) + maxval = (c[i] > maxval) ? c[i] : maxval; + return maxval; +} + +template T omp_min(T *c, uint64_t array_size) { + T minval = std::numeric_limits::max(); +#pragma omp target teams distribute parallel for map(tofrom \ + : minval) \ + reduction(min \ + : minval) + for (int64_t i = 0; i < array_size; i++) { + minval = (c[i] < minval) ? c[i] : minval; + } + return minval; +} + +#define _INNER_LOOP \ + for (int64_t i = ((k * LOOP_STRIDE) + LOOP_START); i < LOOP_SIZE; \ + i += (LOOP_TEAMS * _XTEAM_NUM_THREADS * LOOP_STRIDE)) + +template +T sim_dot(T *a, T *b, uint64_t array_size, int warp_size) { + T sum = T(0); + int devid = 0; + static uint32_t *teams_done_ptr0 = nullptr; + static uint32_t *d_teams_done_ptr0; + static T *d_team_vals0; + static uint32_t team_procs0; + if (!teams_done_ptr0) { + // One-time alloc device array for each teams's reduction value. + team_procs0 = ompx_get_team_procs(devid); + d_team_vals0 = (T *)omp_target_alloc(sizeof(T) * team_procs0, devid); + // Allocate and copy the zero-initialized teams_done counter one time + // because it atomically resets when last team increments it. 
+ teams_done_ptr0 = (uint32_t *)malloc(sizeof(uint32_t)); + *teams_done_ptr0 = 0; + d_teams_done_ptr0 = (uint32_t *)omp_target_alloc(sizeof(uint32_t), devid); + omp_target_memcpy(d_teams_done_ptr0, teams_done_ptr0, sizeof(uint32_t), 0, + 0, devid, omp_get_initial_device()); + } + // Making the array_size 64 bits avoids a data_submit and data_retrieve + const uint64_t LOOP_TEAMS = team_procs0; + const uint64_t LOOP_SIZE = (uint64_t)array_size; + if (warp_size == 64) { +#pragma omp target teams distribute parallel for num_teams(LOOP_TEAMS) \ + num_threads(_XTEAM_NUM_THREADS) map(tofrom \ + : sum) \ + is_device_ptr(d_team_vals0, d_teams_done_ptr0) + for (unsigned int k = 0; k < (LOOP_TEAMS * _XTEAM_NUM_THREADS); k++) { + T val0 = T(0); + constexpr int LOOP_START = 0; + constexpr uint32_t LOOP_STRIDE = 1; + _INNER_LOOP { val0 += a[i] * b[i]; } + _SUM_OVERLOAD_64_FCT(val0, &sum, d_team_vals0, d_teams_done_ptr0, T(0)); + } + } else { +#pragma omp target teams distribute parallel for num_teams(LOOP_TEAMS) \ + num_threads(_XTEAM_NUM_THREADS) map(tofrom \ + : sum) \ + is_device_ptr(d_team_vals0, d_teams_done_ptr0) + for (unsigned int k = 0; k < (LOOP_TEAMS * _XTEAM_NUM_THREADS); k++) { + T val0 = T(0); + constexpr int LOOP_START = 0; + constexpr uint32_t LOOP_STRIDE = 1; + _INNER_LOOP { val0 += a[i] * b[i]; } + _SUM_OVERLOAD_32_FCT(val0, &sum, d_team_vals0, d_teams_done_ptr0, T(0)); + } + } + return sum; +} + +template T sim_max(T *c, uint64_t array_size, int warp_size) { + int devid = 0; + const T minval = std::numeric_limits::lowest(); + T retval = minval; + static uint32_t *teams_done_ptr1 = nullptr; + static uint32_t *d_teams_done_ptr1; + static T *d_team_vals1; + static uint32_t team_procs1; + if (!teams_done_ptr1) { + // One-time alloc device array for each teams's reduction value. 
+ team_procs1 = ompx_get_team_procs(devid); + d_team_vals1 = (T *)omp_target_alloc(sizeof(T) * team_procs1, devid); + // Allocate and copy the zero-initialized teams_done counter one time + // because it atomically resets when last team increments it. + // Clang can create a global initialized to 0 and remove the alloc and copy + teams_done_ptr1 = (uint32_t *)malloc(sizeof(uint32_t)); + *teams_done_ptr1 = 0; + d_teams_done_ptr1 = (uint32_t *)omp_target_alloc(sizeof(uint32_t), devid); + omp_target_memcpy(d_teams_done_ptr1, teams_done_ptr1, sizeof(uint32_t), 0, + 0, devid, omp_get_initial_device()); + } + // Making the array_size 64 bits somehow avoids a data_submit and + // data_retrieve.? + const uint64_t LOOP_TEAMS = team_procs1; + const uint64_t LOOP_SIZE = (uint64_t)array_size; + const T LOOP_INITVAL = minval; + if (warp_size == 64) { +#pragma omp target teams distribute parallel for num_teams(LOOP_TEAMS) \ + num_threads(_XTEAM_NUM_THREADS) map(tofrom \ + : retval) \ + is_device_ptr(d_team_vals1, d_teams_done_ptr1) + for (unsigned int k = 0; k < (LOOP_TEAMS * _XTEAM_NUM_THREADS); k++) { + T val1 = LOOP_INITVAL; + const int LOOP_START = 0; + const uint32_t LOOP_STRIDE = 1; + _INNER_LOOP { val1 = (c[i] > val1) ? c[i] : val1; } + // we do not have device numeric_limits so use host numeric_limits + // clang codegen should embed a constant for each datatype to pass to + // xteamr + _MAX_OVERLOAD_64_FCT(val1, &retval, d_team_vals1, d_teams_done_ptr1, + LOOP_INITVAL); + } + } else { +#pragma omp target teams distribute parallel for num_teams(LOOP_TEAMS) \ + num_threads(_XTEAM_NUM_THREADS) map(tofrom \ + : retval) \ + is_device_ptr(d_team_vals1, d_teams_done_ptr1) + for (unsigned int k = 0; k < (LOOP_TEAMS * _XTEAM_NUM_THREADS); k++) { + T val1 = LOOP_INITVAL; + const int LOOP_START = 0; + const uint32_t LOOP_STRIDE = 1; + _INNER_LOOP { val1 = (c[i] > val1) ? 
c[i] : val1; } + _MAX_OVERLOAD_32_FCT(val1, &retval, d_team_vals1, d_teams_done_ptr1, + LOOP_INITVAL); + } + } + return retval; +} + +template T sim_min(T *c, uint64_t array_size, int warp_size) { + int devid = 0; + const T maxval = std::numeric_limits::max(); + T retval = maxval; + static uint32_t *teams_done_ptr2; + static uint32_t *d_teams_done_ptr2; + static T *d_team_vals2; + static uint32_t team_procs2; + if (!teams_done_ptr2) { + // One-time alloc device array for each teams's reduction value. + team_procs2 = ompx_get_team_procs(devid); + d_team_vals2 = (T *)omp_target_alloc(sizeof(T) * team_procs2, devid); + // Allocate and copy the zero-initialized teams_done counter one time + // because it atomically resets when last team increments it. + // Clang can create a global initialized to 0 and remove the alloc and copy + teams_done_ptr2 = (uint32_t *)malloc(sizeof(uint32_t)); + *teams_done_ptr2 = 0; + d_teams_done_ptr2 = (uint32_t *)omp_target_alloc(sizeof(uint32_t), devid); + omp_target_memcpy(d_teams_done_ptr2, teams_done_ptr2, sizeof(uint32_t), 0, + 0, devid, omp_get_initial_device()); + } + // Making the array_size 64 bits avoids a data_submit and data_retrieve. + const uint64_t LOOP_TEAMS = team_procs2; + const uint64_t LOOP_SIZE = (uint64_t)array_size; + const T LOOP_INITVAL = maxval; + if (warp_size == 64) { +#pragma omp target teams distribute parallel for num_teams(LOOP_TEAMS) \ + num_threads(_XTEAM_NUM_THREADS) map(tofrom \ + : retval) \ + is_device_ptr(d_team_vals2, d_teams_done_ptr2) + for (unsigned int k = 0; k < (LOOP_TEAMS * _XTEAM_NUM_THREADS); k++) { + T val2 = LOOP_INITVAL; + const int LOOP_START = 0; + const uint32_t LOOP_STRIDE = 1; + _INNER_LOOP { val2 = (c[i] < val2) ? 
c[i] : val2; } + _MIN_OVERLOAD_64_FCT(val2, &retval, d_team_vals2, d_teams_done_ptr2, + LOOP_INITVAL); + } + } else { +#pragma omp target teams distribute parallel for num_teams(LOOP_TEAMS) \ + num_threads(_XTEAM_NUM_THREADS) map(tofrom \ + : retval) \ + is_device_ptr(d_team_vals2, d_teams_done_ptr2) + for (unsigned int k = 0; k < (LOOP_TEAMS * _XTEAM_NUM_THREADS); k++) { + T val2 = LOOP_INITVAL; + const int LOOP_START = 0; + const uint32_t LOOP_STRIDE = 1; + _INNER_LOOP { val2 = (c[i] < val2) ? c[i] : val2; } + _MIN_OVERLOAD_32_FCT(val2, &retval, d_team_vals2, d_teams_done_ptr2, + LOOP_INITVAL); + } + } + return retval; +} + +template +void _check_val(T computed_val, T gold_val, const char *msg) { + double ETOL = 0.0000001; + if (DATA_TYPE_IS_INT) { + if (computed_val != gold_val) { + std::cerr << msg << " FAIL " + << "Integar Value was " << computed_val << " but should be " + << gold_val << std::endl; + test_run_rc = 1; + } + } else { + double dcomputed_val = (double)computed_val; + double dvalgold = (double)gold_val; + double ompErrSum = abs((dcomputed_val - dvalgold) / dvalgold); + if (ompErrSum > ETOL) { + std::cerr << msg << " FAIL " << ompErrSum << " tol:" << ETOL << std::endl + << std::setprecision(15) << "Value was " << computed_val + << " but should be " << gold_val << std::endl; + test_run_rc = 1; + } + } +} + +template +void run_tests(const uint64_t array_size) { + + // FIXME: How do we get warpsize of a device from host? 
+ int warp_size; +#pragma omp target map(tofrom : warp_size) + warp_size = __kmpc_get_warp_size(); + + // Align on 2M boundaries + T *a = (T *)aligned_alloc(2 * 1024 * 1024, sizeof(T) * array_size); + T *b = (T *)aligned_alloc(2 * 1024 * 1024, sizeof(T) * array_size); + T *c = (T *)aligned_alloc(2 * 1024 * 1024, sizeof(T) * array_size); +#pragma omp target enter data map(alloc \ + : a [0:array_size], b [0:array_size], \ + c [0:array_size]) +#pragma omp target teams distribute parallel for + for (int64_t i = 0; i < array_size; i++) { + a[i] = 2; + b[i] = 3; + c[i] = (i + 1); + } + + std::cout << "Running kernels " << repeat_num_times << " times" << std::endl; + std::cout << "Ignoring timing of first " << ignore_times << " runs " + << std::endl; + + double ETOL = 0.0000001; + if (DATA_TYPE_IS_INT) { + std::cout << "Integer Size: " << sizeof(T) << std::endl; + } else { + if (sizeof(T) == sizeof(float)) + std::cout << "Precision: float" << std::endl; + else + std::cout << "Precision: double" << std::endl; + } + + int team_procs = ompx_get_team_procs(omp_get_default_device()); + std::cout << "Warp size:" << warp_size << std::endl; + std::cout << "Team processors:" << team_procs << std::endl; + std::cout << "Array elements: " << array_size << std::endl; + std::cout << "Array size: " << ((array_size * sizeof(T)) / (1024 * 1024)) + << " MB" << std::endl; + + T goldDot = (T)6 * (T)array_size; + T goldMax = (T)array_size; + T goldMin = (T)1; + + double goldDot_d = (double)goldDot; + double goldMax_d = (double)goldMax; + double goldMin_d = (double)goldMin; + + // List of times + std::vector> timings(6); + + // Declare timers + std::chrono::high_resolution_clock::time_point t1, t2; + + // Timing loop + for (unsigned int k = 0; k < repeat_num_times; k++) { + t1 = std::chrono::high_resolution_clock::now(); + T omp_sum = omp_dot(a, b, array_size); + t2 = std::chrono::high_resolution_clock::now(); + timings[0].push_back( + std::chrono::duration_cast>(t2 - t1) + .count()); + 
_check_val(omp_sum, goldDot, "omp_dot"); + + t1 = std::chrono::high_resolution_clock::now(); + T sim_sum = sim_dot(a, b, array_size, warp_size); + t2 = std::chrono::high_resolution_clock::now(); + timings[1].push_back( + std::chrono::duration_cast>(t2 - t1) + .count()); + _check_val(sim_sum, goldDot, "sim_dot"); + + t1 = std::chrono::high_resolution_clock::now(); + T omp_max_val = omp_max(c, array_size); + t2 = std::chrono::high_resolution_clock::now(); + timings[2].push_back( + std::chrono::duration_cast>(t2 - t1) + .count()); + _check_val(omp_max_val, goldMax, "omp_max"); + + t1 = std::chrono::high_resolution_clock::now(); + T sim_max_val = sim_max(c, array_size, warp_size); + t2 = std::chrono::high_resolution_clock::now(); + timings[3].push_back( + std::chrono::duration_cast>(t2 - t1) + .count()); + _check_val(sim_max_val, goldMax, "sim_max"); + + t1 = std::chrono::high_resolution_clock::now(); + T omp_min_val = omp_min(c, array_size); + t2 = std::chrono::high_resolution_clock::now(); + timings[4].push_back( + std::chrono::duration_cast>(t2 - t1) + .count()); + _check_val(omp_min_val, goldMin, "omp_min"); + + t1 = std::chrono::high_resolution_clock::now(); + T sim_min_val = sim_min(c, array_size, warp_size); + t2 = std::chrono::high_resolution_clock::now(); + timings[5].push_back( + std::chrono::duration_cast>(t2 - t1) + .count()); + _check_val(sim_min_val, goldMin, "sim_min"); + + } // end Timing loop + + // Display timing results + std::cout << std::left << std::setw(12) << "Function" << std::left + << std::setw(12) << "Best-MB/sec" << std::left << std::setw(12) + << " Min (sec)" << std::left << std::setw(12) << " Max" + << std::left << std::setw(12) << "Average" << std::left + << std::setw(12) << "Avg-MB/sec" << std::endl; + + std::cout << std::fixed; + + std::string labels[6] = {"ompdot", "simdot", "ompmax", + "simmax", "ompmin", "simmin"}; + size_t sizes[6] = {2 * sizeof(T) * array_size, 2 * sizeof(T) * array_size, + 1 * sizeof(T) * array_size, 1 * 
sizeof(T) * array_size, + 1 * sizeof(T) * array_size, 1 * sizeof(T) * array_size}; + + for (int i = 0; i < 6; i++) { + // Get min/max; ignore the first couple results + auto minmax = std::minmax_element(timings[i].begin() + ignore_times, + timings[i].end()); + + // Calculate average; ignore ignore_times + double average = std::accumulate(timings[i].begin() + ignore_times, + timings[i].end(), 0.0) / + (double)(repeat_num_times - ignore_times); + + printf(" %s %8.0f %8.6f %8.6f %8.6f %8.0f\n", + labels[i].c_str(), 1.0E-6 * sizes[i] / (*minmax.first), + (double)*minmax.first, (double)*minmax.second, (double)average, + 1.0E-6 * sizes[i] / (average)); + } +#pragma omp target exit data map(release \ + : a [0:array_size], b [0:array_size], \ + c [0:array_size]) + free(a); + free(b); + free(c); +} + +/// ======================== START COMPLEX TESTING =========================== + +template +void _check_val_complex(TC computed_val_complex, TC gold_val_complex, + const char *msg) { + T gold_val_r = __real__(gold_val_complex); + T computed_val_r = __real__(computed_val_complex); + T gold_val_i = __imag__(gold_val_complex); + T computed_val_i = __imag__(computed_val_complex); + double ETOL = 0.0000001; + double computed_val_r_d = (double)computed_val_r; + double valgold_r_d = (double)gold_val_r; + double ompErrSum_r = abs((computed_val_r_d - valgold_r_d) / valgold_r_d); + double computed_val_i_d = (double)computed_val_i; + double valgold_i_d = (double)gold_val_i; + double ompErrSum_i = abs((computed_val_i_d - valgold_i_d) / valgold_i_d); + if ((ompErrSum_r > ETOL) || (ompErrSum_i > ETOL)) { + std::cerr << msg << " FAIL " << ompErrSum_r << " tol:" << ETOL << std::endl + << std::setprecision(15) << "Value was (" << computed_val_r + << " + " << computed_val_i << " i )" << std::endl + << " but should be (" << gold_val_r << " + " << gold_val_i + << "i) " << std::endl; + test_run_rc = 1; + } +} + +template TC omp_dot_complex(TC *a, TC *b, uint64_t array_size) { + TC dot; + 
__real__(dot) = 0.0; + __imag__(dot) = 0.0; +#pragma omp target teams distribute parallel for map(tofrom: dot) reduction(+:dot) + for (int64_t i = 0; i < array_size; i++) + dot += a[i] * b[i]; + return dot; +} + +template +TC sim_dot_complex(TC *a, TC *b, uint64_t array_size, int warp_size) { + TC zero_c; + __real__(zero_c) = 0.0; + __imag__(zero_c) = 0.0; + TC dot = zero_c; + int devid = 0; + static uint32_t *teams_done_ptr00 = nullptr; + static uint32_t *d_teams_done_ptr00; + static TC *d_team_vals00; + static uint32_t team_procs00; + if (!teams_done_ptr00) { + // One-time alloc device array for each teams's reduction value. + team_procs00 = ompx_get_team_procs(devid); + d_team_vals00 = (TC *)omp_target_alloc(sizeof(TC) * team_procs00, devid); + // Allocate and copy the zero-initialized teams_done counter one time + // because it atomically resets when last team increments it. + teams_done_ptr00 = (uint32_t *)malloc(sizeof(uint32_t)); + *teams_done_ptr00 = 0; + d_teams_done_ptr00 = (uint32_t *)omp_target_alloc(sizeof(uint32_t), devid); + omp_target_memcpy(d_teams_done_ptr00, teams_done_ptr00, sizeof(uint32_t), 0, + 0, devid, omp_get_initial_device()); + } + // Making the array_size 64 bits avoids a data_submit and data_retrieve + const uint64_t LOOP_TEAMS = team_procs00; + const uint64_t LOOP_SIZE = (uint64_t)array_size; + if (warp_size == 64) { +#pragma omp target teams distribute parallel for num_teams(LOOP_TEAMS) \ + num_threads(_XTEAM_NUM_THREADS) map(tofrom \ + : dot) \ + is_device_ptr(d_team_vals00, d_teams_done_ptr00) + for (unsigned int k = 0; k < (LOOP_TEAMS * _XTEAM_NUM_THREADS); k++) { + TC val00 = zero_c; + constexpr int LOOP_START = 0; + constexpr uint32_t LOOP_STRIDE = 1; + _INNER_LOOP { val00 += a[i] * b[i]; } + _SUM_OVERLOAD_64_FCT(val00, &dot, d_team_vals00, d_teams_done_ptr00, + zero_c); + } + } else { +#pragma omp target teams distribute parallel for num_teams(LOOP_TEAMS) \ + num_threads(_XTEAM_NUM_THREADS) map(tofrom \ + : dot) \ + 
is_device_ptr(d_team_vals00, d_teams_done_ptr00) + for (unsigned int k = 0; k < (LOOP_TEAMS * _XTEAM_NUM_THREADS); k++) { + TC val00 = zero_c; + constexpr int LOOP_START = 0; + constexpr uint32_t LOOP_STRIDE = 1; + _INNER_LOOP { val00 += a[i] * b[i]; } + _SUM_OVERLOAD_32_FCT(val00, &dot, d_team_vals00, d_teams_done_ptr00, + zero_c); + } + } + return dot; +} + +template +void run_tests_complex(const uint64_t array_size) { + + // FIXME: Is there a way to get warpsize of device from host? + // clang Codegen knows warpsize from GPU type. + int warp_size; +#pragma omp target map(tofrom : warp_size) + warp_size = __kmpc_get_warp_size(); + + TC *a = (TC *)aligned_alloc(2 * 1024 * 1024, sizeof(TC) * array_size); + TC *b = (TC *)aligned_alloc(2 * 1024 * 1024, sizeof(TC) * array_size); + +#pragma omp target enter data map(alloc : a [0:array_size], b [0:array_size]) + TC startA; + __real__(startA) = 1.0; + __imag__(startA) = 1.0; + TC startB; + __real__(startB) = 1.0; + __imag__(startB) = -1.0; + +#pragma omp target teams distribute parallel for + for (int64_t i = 0; i < array_size; i++) { + a[i] = startA; + b[i] = startB; + // a[i] * b[i] = 2 + 0i + } + + std::cout << "Running kernels " << repeat_num_times << " times" << std::endl; + std::cout << "Ignoring timing of first " << ignore_times << " runs " + << std::endl; + + double ETOL = 0.0000001; + if (sizeof(TC) == sizeof(float _Complex)) + std::cout << "Precision: float _Complex" << std::endl; + else + std::cout << "Precision: double _Complex" << std::endl; + + int team_procs = ompx_get_team_procs(omp_get_default_device()); + std::cout << "Warp size:" << warp_size << std::endl; + std::cout << "Team processors:" << team_procs << std::endl; + std::cout << "Array elements: " << array_size << std::endl; + std::cout << "Array size: " << ((array_size * sizeof(TC)) / (1024 * 1024)) + << " MB" << std::endl; + + T goldDotr = T(2) * (T)array_size; + T goldDoti = T(0); + + TC goldDot; + __real__(goldDot) = goldDotr; + 
__imag__(goldDot) = goldDoti; + + // List of times + std::vector> timings(2); + + // Declare timers + std::chrono::high_resolution_clock::time_point t1, t2; + + // timing loop + for (unsigned int k = 0; k < repeat_num_times; k++) { + t1 = std::chrono::high_resolution_clock::now(); + TC omp_sum = omp_dot_complex(a, b, array_size); + t2 = std::chrono::high_resolution_clock::now(); + timings[0].push_back( + std::chrono::duration_cast>(t2 - t1) + .count()); + _check_val_complex(omp_sum, goldDot, "omp_dot"); + + t1 = std::chrono::high_resolution_clock::now(); + TC sim_sum = sim_dot_complex(a, b, array_size, warp_size); + t2 = std::chrono::high_resolution_clock::now(); + timings[1].push_back( + std::chrono::duration_cast>(t2 - t1) + .count()); + _check_val_complex(sim_sum, goldDot, "sim_dot"); + + } // end timing loop + + // Display timing results + std::cout << std::left << std::setw(12) << "Function" << std::left + << std::setw(12) << "Best-MB/sec" << std::left << std::setw(12) + << " Min (sec)" << std::left << std::setw(12) << " Max" + << std::left << std::setw(12) << "Average" << std::left + << std::setw(12) << "Avg-MB/sec" << std::endl; + + std::cout << std::fixed; + + std::string labels[2] = {"ompdot", "simdot"}; + size_t sizes[2] = {2 * sizeof(TC) * array_size, 2 * sizeof(TC) * array_size}; + + for (int i = 0; i < 2; i++) { + // Get min/max; ignore the first couple results + auto minmax = std::minmax_element(timings[i].begin() + ignore_times, + timings[i].end()); + + // Calculate average; ignore ignore_times + double average = std::accumulate(timings[i].begin() + ignore_times, + timings[i].end(), 0.0) / + (double)(repeat_num_times - ignore_times); + + printf(" %s %8.0f %8.6f %8.6f %8.6f %8.0f\n", + labels[i].c_str(), 1.0E-6 * sizes[i] / (*minmax.first), + (double)*minmax.first, (double)*minmax.second, (double)average, + 1.0E-6 * sizes[i] / (average)); + } +#pragma omp target exit data map(release : a [0:array_size], b [0:array_size]) + free(a); + free(b); +} diff 
--git a/openmp/libomptarget/test/xteamr/xteamr_host_dev.h b/openmp/libomptarget/test/xteamr/xteamr_host_dev.h
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/test/xteamr/xteamr_host_dev.h
@@ -0,0 +1,638 @@
+//===---- xteamr_host_dev.h - Device declares and host defs for xteamr --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Cannot include Interface.h for this test, so this is a repeat of headers
+// for Xteamr and Rfuns from that file. This file also contains the host
+// dummy functions needed to link the test.
+//
+//===----------------------------------------------------------------------===//
+
+#if defined(__AMDGCN__) || defined(__NVPTX__)
+extern "C" {
+
+///
+/// Helper functions for high performance cross team reductions.
+///
+/// Name decoder: __kmpc_xteamr_<dtype>_<waves>x<warpsize>
+///   <waves>: number of warps/waves in team
+///   <warpsize>: 32 or 64
+/// example: __kmpc_xteamr_f_16x64 is the helper function for data
+/// type float with fixed teamsize of 1024 threads.
+/// There are 48 (6 x 8) helper functions.
+/// 6 configurations of teamsize are currently created.
+/// The most performant configs use max teamsize 1024: 16x64 and 32x32.
+/// Currently the other configs are 8x64, 4x64, 16x32, and 8x32.
+/// 8 data types available: double, float, int, uint, long, ulong,
+/// double _Complex, and float _Complex
+///
+/// All helper functions have these 7 args:
+/// arg1: the thread local reduction value
+/// arg2: pointer to where result is written
+/// arg3: global array of team values for this reduction instance
+/// arg4: atomic counter of completed teams for this reduction instance
+/// arg5: void function pointer to pair reduction function,
+/// (e.g. 
sum(&a,b),min(&a,b),max(&a,b) +/// arg6: equivalent (to arg5) void function pointer of pair reduction +/// function on LDS memory, (e.g. sum(&a,&b),min(&a,&b),max(&a,&b) +/// arg7: Initializing value for the reduction type +/// +#define _RF_LDS volatile __attribute__((address_space(3))) +void __kmpc_xteamr_d_16x64(double v, double *r_ptr, double *tvals, + uint32_t *td_ptr, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), + const double iv); +void __kmpc_xteamr_f_16x64(float v, float *r_ptr, float *tvals, + uint32_t *td_ptr, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), + const float iv); +void __kmpc_xteamr_cd_16x64(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(_RF_LDS double _Complex *, + _RF_LDS double _Complex *), + const double _Complex iv); +void __kmpc_xteamr_cf_16x64(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(_RF_LDS float _Complex *, + _RF_LDS float _Complex *), + const float _Complex iv); +void __kmpc_xteamr_i_16x64(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), + const int iv); +void __kmpc_xteamr_ui_16x64(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(_RF_LDS uint32_t *, + _RF_LDS uint32_t *), + const uint32_t iv); +void __kmpc_xteamr_l_16x64(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), + const long iv); +void __kmpc_xteamr_ul_16x64(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(_RF_LDS uint64_t *, + _RF_LDS uint64_t *), + const uint64_t iv); +void 
__kmpc_xteamr_d_8x64(double v, double *r_ptr, double *tvals, + uint32_t *td_ptr, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), + const double iv); +void __kmpc_xteamr_f_8x64(float v, float *r_ptr, float *tvals, uint32_t *td_ptr, + void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), + const float iv); +void __kmpc_xteamr_cd_8x64(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(_RF_LDS double _Complex *, + _RF_LDS double _Complex *), + const double _Complex iv); +void __kmpc_xteamr_cf_8x64(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(_RF_LDS float _Complex *, + _RF_LDS float _Complex *), + const float _Complex iv); +void __kmpc_xteamr_i_8x64(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), + const int iv); +void __kmpc_xteamr_ui_8x64(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(_RF_LDS uint32_t *, + _RF_LDS uint32_t *), + const uint32_t iv); +void __kmpc_xteamr_l_8x64(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), + const long iv); +void __kmpc_xteamr_ul_8x64(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(_RF_LDS uint64_t *, + _RF_LDS uint64_t *), + const uint64_t iv); +void __kmpc_xteamr_d_4x64(double v, double *r_ptr, double *tvals, + uint32_t *td_ptr, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), + const double iv); +void __kmpc_xteamr_f_4x64(float v, float *r_ptr, float *tvals, uint32_t *td_ptr, + void (*_rf)(float *, float), + void 
(*_rf_lds)(_RF_LDS float *, _RF_LDS float *), + const float iv); +void __kmpc_xteamr_cd_4x64(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(_RF_LDS double _Complex *, + _RF_LDS double _Complex *), + const double _Complex iv); +void __kmpc_xteamr_cf_4x64(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(_RF_LDS float _Complex *, + _RF_LDS float _Complex *), + const float _Complex iv); +void __kmpc_xteamr_i_4x64(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), + const int iv); +void __kmpc_xteamr_ui_4x64(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(_RF_LDS uint32_t *, + _RF_LDS uint32_t *), + const uint32_t iv); +void __kmpc_xteamr_l_4x64(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), + const long iv); +void __kmpc_xteamr_ul_4x64(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(_RF_LDS uint64_t *, + _RF_LDS uint64_t *), + const uint64_t iv); +void __kmpc_xteamr_d_32x32(double v, double *r_ptr, double *tvals, + uint32_t *td_ptr, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), + const double iv); +void __kmpc_xteamr_f_32x32(float v, float *r_ptr, float *tvals, + uint32_t *td_ptr, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), + const float iv); +void __kmpc_xteamr_cd_32x32(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(_RF_LDS double _Complex *, + _RF_LDS double _Complex *), + const 
double _Complex iv); +void __kmpc_xteamr_cf_32x32(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(_RF_LDS float _Complex *, + _RF_LDS float _Complex *), + const float _Complex iv); + +void __kmpc_xteamr_i_32x32(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), + const int iv); + +void __kmpc_xteamr_ui_32x32(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(_RF_LDS uint32_t *, + _RF_LDS uint32_t *), + const uint32_t iv); +void __kmpc_xteamr_l_32x32(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), + const long iv); +void __kmpc_xteamr_ul_32x32(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(_RF_LDS uint64_t *, + _RF_LDS uint64_t *), + const uint64_t iv); +void __kmpc_xteamr_d_16x32(double v, double *r_ptr, double *tvals, + uint32_t *td_ptr, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), + const double iv); +void __kmpc_xteamr_f_16x32(float v, float *r_ptr, float *tvals, + uint32_t *td_ptr, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), + const float iv); +void __kmpc_xteamr_cd_16x32(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(_RF_LDS double _Complex *, + _RF_LDS double _Complex *), + const double _Complex iv); +void __kmpc_xteamr_cf_16x32(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(_RF_LDS float _Complex *, + _RF_LDS float _Complex *), + const float _Complex iv); + +void 
__kmpc_xteamr_i_16x32(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), + const int iv); +void __kmpc_xteamr_ui_16x32(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(_RF_LDS uint32_t *, + _RF_LDS uint32_t *), + const uint32_t iv); +void __kmpc_xteamr_l_16x32(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), + const long iv); +void __kmpc_xteamr_ul_16x32(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(_RF_LDS uint64_t *, + _RF_LDS uint64_t *), + const uint64_t iv); +void __kmpc_xteamr_d_8x32(double v, double *r_ptr, double *tvals, + uint32_t *td_ptr, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), + const double iv); +void __kmpc_xteamr_f_8x32(float v, float *r_ptr, float *tvals, uint32_t *td_ptr, + void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), + const float iv); +void __kmpc_xteamr_cd_8x32(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(_RF_LDS double _Complex *, + _RF_LDS double _Complex *), + const double _Complex iv); +void __kmpc_xteamr_cf_8x32(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(_RF_LDS float _Complex *, + _RF_LDS float _Complex *), + const float _Complex iv); +void __kmpc_xteamr_i_8x32(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), + const int iv); +void __kmpc_xteamr_ui_8x32(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void 
(*_rf_lds)(_RF_LDS uint32_t *, + _RF_LDS uint32_t *), + const uint32_t iv); +void __kmpc_xteamr_l_8x32(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), + const long iv); +void __kmpc_xteamr_ul_8x32(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(_RF_LDS uint64_t *, + _RF_LDS uint64_t *), + const uint64_t iv); + +/// Builtin pair reduction functions. +/// These become function pointers for arg5 and arg6 of xteamr above. +/// Each pair reduction function must have two variants to support xteamr. +/// The 1st is for TLS memory and the 2nd is for LDS (scratchpad) memory. +/// These are defined in Rfuns.cpp. User defined reductions require +/// that Clang codegen generate these functions. +void __kmpc_rfun_sum_d(double *val, double otherval); +void __kmpc_rfun_sum_lds_d(_RF_LDS double *val, _RF_LDS double *otherval); +void __kmpc_rfun_sum_f(float *val, float otherval); +void __kmpc_rfun_sum_lds_f(_RF_LDS float *val, _RF_LDS float *otherval); +void __kmpc_rfun_sum_cd(double _Complex *val, double _Complex otherval); +void __kmpc_rfun_sum_lds_cd(_RF_LDS double _Complex *val, + _RF_LDS double _Complex *otherval); +void __kmpc_rfun_sum_cf(float _Complex *val, float _Complex otherval); +void __kmpc_rfun_sum_lds_cf(_RF_LDS float _Complex *val, + _RF_LDS float _Complex *otherval); +void __kmpc_rfun_sum_i(int *val, int otherval); +void __kmpc_rfun_sum_lds_i(_RF_LDS int *val, _RF_LDS int *otherval); +void __kmpc_rfun_sum_ui(unsigned int *val, unsigned int otherval); +void __kmpc_rfun_sum_lds_ui(_RF_LDS unsigned int *val, + _RF_LDS unsigned int *otherval); +void __kmpc_rfun_sum_l(long *val, long otherval); +void __kmpc_rfun_sum_lds_l(_RF_LDS long *val, _RF_LDS long *otherval); +void __kmpc_rfun_sum_ul(unsigned long *val, unsigned long otherval); +void __kmpc_rfun_sum_lds_ul(_RF_LDS unsigned long *val, + _RF_LDS unsigned long 
*otherval); +/// Complex variables have no compare, so no min or max for cf and cd. +void __kmpc_rfun_min_d(double *val, double otherval); +void __kmpc_rfun_min_lds_d(_RF_LDS double *val, _RF_LDS double *otherval); +void __kmpc_rfun_min_f(float *val, float otherval); +void __kmpc_rfun_min_lds_f(_RF_LDS float *val, _RF_LDS float *otherval); +void __kmpc_rfun_min_i(int *val, int otherval); +void __kmpc_rfun_min_lds_i(_RF_LDS int *val, _RF_LDS int *otherval); +void __kmpc_rfun_min_ui(unsigned int *val, unsigned int otherval); +void __kmpc_rfun_min_lds_ui(_RF_LDS unsigned int *val, + _RF_LDS unsigned int *otherval); +void __kmpc_rfun_min_l(long *val, long otherval); +void __kmpc_rfun_min_lds_l(_RF_LDS long *val, _RF_LDS long *otherval); +void __kmpc_rfun_min_ul(unsigned long *val, unsigned long otherval); +void __kmpc_rfun_min_lds_ul(_RF_LDS unsigned long *val, + _RF_LDS unsigned long *otherval); +void __kmpc_rfun_max_d(double *val, double otherval); +void __kmpc_rfun_max_lds_d(_RF_LDS double *val, _RF_LDS double *otherval); +void __kmpc_rfun_max_f(float *val, float otherval); +void __kmpc_rfun_max_lds_f(_RF_LDS float *val, _RF_LDS float *otherval); +void __kmpc_rfun_max_i(int *val, int otherval); +void __kmpc_rfun_max_lds_i(_RF_LDS int *val, _RF_LDS int *otherval); +void __kmpc_rfun_max_ui(unsigned int *val, unsigned int otherval); +void __kmpc_rfun_max_lds_ui(_RF_LDS unsigned int *val, + _RF_LDS unsigned int *otherval); +void __kmpc_rfun_max_l(long *val, long otherval); +void __kmpc_rfun_max_lds_l(_RF_LDS long *val, _RF_LDS long *otherval); +void __kmpc_rfun_max_ul(unsigned long *val, unsigned long otherval); +void __kmpc_rfun_max_lds_ul(_RF_LDS unsigned long *val, + _RF_LDS unsigned long *otherval); +#undef _RF_LDS + +int __kmpc_get_warp_size(); + +} // end of extern "C" + +#else // end of device compilation + +// For host compilation, define null functions for host linking + +extern "C" { +#define _RF_LDS +void __kmpc_xteamr_d_16x64(double v, double *r_ptr, double 
*tvals, + uint32_t *td_ptr, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), + const double iv) {} +void __kmpc_xteamr_f_16x64(float v, float *r_ptr, float *tvals, + uint32_t *td_ptr, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), + const float iv) {} +void __kmpc_xteamr_cd_16x64(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(_RF_LDS double _Complex *, + _RF_LDS double _Complex *), + const double _Complex iv) {} +void __kmpc_xteamr_cf_16x64(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(_RF_LDS float _Complex *, + _RF_LDS float _Complex *), + const float _Complex iv) {} +void __kmpc_xteamr_i_16x64(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), + const int iv) {} +void __kmpc_xteamr_ui_16x64(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(_RF_LDS uint32_t *, + _RF_LDS uint32_t *), + const uint32_t iv) {} +void __kmpc_xteamr_l_16x64(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), + const long iv) {} +void __kmpc_xteamr_ul_16x64(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(_RF_LDS uint64_t *, + _RF_LDS uint64_t *), + const uint64_t iv) {} +void __kmpc_xteamr_d_8x64(double v, double *r_ptr, double *tvals, + uint32_t *td_ptr, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), + const double iv) {} +void __kmpc_xteamr_f_8x64(float v, float *r_ptr, float *tvals, uint32_t *td_ptr, + void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS 
float *), + const float iv) {} +void __kmpc_xteamr_cd_8x64(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(_RF_LDS double _Complex *, + _RF_LDS double _Complex *), + const double _Complex iv) {} +void __kmpc_xteamr_cf_8x64(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(_RF_LDS float _Complex *, + _RF_LDS float _Complex *), + const float _Complex iv) {} +void __kmpc_xteamr_i_8x64(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), + const int iv) {} +void __kmpc_xteamr_ui_8x64(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(_RF_LDS uint32_t *, + _RF_LDS uint32_t *), + const uint32_t iv) {} +void __kmpc_xteamr_l_8x64(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), + const long iv) {} +void __kmpc_xteamr_ul_8x64(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(_RF_LDS uint64_t *, + _RF_LDS uint64_t *), + const uint64_t iv) {} +void __kmpc_xteamr_d_4x64(double v, double *r_ptr, double *tvals, + uint32_t *td_ptr, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), + const double iv) {} +void __kmpc_xteamr_f_4x64(float v, float *r_ptr, float *tvals, uint32_t *td_ptr, + void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), + const float iv) {} +void __kmpc_xteamr_cd_4x64(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(_RF_LDS double _Complex *, + _RF_LDS double _Complex *), + const double _Complex iv) {} 
+void __kmpc_xteamr_cf_4x64(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(_RF_LDS float _Complex *, + _RF_LDS float _Complex *), + const float _Complex iv) {} +void __kmpc_xteamr_i_4x64(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), + const int iv) {} +void __kmpc_xteamr_ui_4x64(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(_RF_LDS uint32_t *, + _RF_LDS uint32_t *), + const uint32_t iv) {} +void __kmpc_xteamr_l_4x64(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), + const long iv) {} +void __kmpc_xteamr_ul_4x64(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(_RF_LDS uint64_t *, + _RF_LDS uint64_t *), + const uint64_t iv) {} +void __kmpc_xteamr_d_32x32(double v, double *r_ptr, double *tvals, + uint32_t *td_ptr, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), + const double iv) {} +void __kmpc_xteamr_f_32x32(float v, float *r_ptr, float *tvals, + uint32_t *td_ptr, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), + const float iv) {} +void __kmpc_xteamr_cd_32x32(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(_RF_LDS double _Complex *, + _RF_LDS double _Complex *), + const double _Complex iv) {} +void __kmpc_xteamr_cf_32x32(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(_RF_LDS float _Complex *, + _RF_LDS float _Complex *), + const float _Complex iv) {} +void __kmpc_xteamr_i_32x32(int v, 
int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), + const int iv) {} +void __kmpc_xteamr_ui_32x32(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(_RF_LDS uint32_t *, + _RF_LDS uint32_t *), + const uint32_t iv) {} +void __kmpc_xteamr_l_32x32(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), + const long iv) {} +void __kmpc_xteamr_ul_32x32(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(_RF_LDS uint64_t *, + _RF_LDS uint64_t *), + const uint64_t iv) {} +void __kmpc_xteamr_d_16x32(double v, double *r_ptr, double *tvals, + uint32_t *td_ptr, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), + const double iv) {} +void __kmpc_xteamr_f_16x32(float v, float *r_ptr, float *tvals, + uint32_t *td_ptr, void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), + const float iv) {} +void __kmpc_xteamr_cd_16x32(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(_RF_LDS double _Complex *, + _RF_LDS double _Complex *), + const double _Complex iv) {} +void __kmpc_xteamr_cf_16x32(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(_RF_LDS float _Complex *, + _RF_LDS float _Complex *), + const float _Complex iv) {} +void __kmpc_xteamr_i_16x32(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), + const int iv) {} +void __kmpc_xteamr_ui_16x32(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(_RF_LDS 
uint32_t *, + _RF_LDS uint32_t *), + const uint32_t iv) {} +void __kmpc_xteamr_l_16x32(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), + const long iv) {} +void __kmpc_xteamr_ul_16x32(uint64_t v, uint64_t *r_ptr, uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(_RF_LDS uint64_t *, + _RF_LDS uint64_t *), + const uint64_t iv) {} +void __kmpc_xteamr_d_8x32(double v, double *r_ptr, double *tvals, + uint32_t *td_ptr, void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), + const double iv) {} +void __kmpc_xteamr_f_8x32(float v, float *r_ptr, float *tvals, uint32_t *td_ptr, + void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), + const float iv) {} +void __kmpc_xteamr_cd_8x32(double _Complex v, double _Complex *r_ptr, + double _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(double _Complex *, double _Complex), + void (*_rf_lds)(_RF_LDS double _Complex *, + _RF_LDS double _Complex *), + const double _Complex iv) {} +void __kmpc_xteamr_cf_8x32(float _Complex v, float _Complex *r_ptr, + float _Complex *tvals, uint32_t *td_ptr, + void (*_rf)(float _Complex *, float _Complex), + void (*_rf_lds)(_RF_LDS float _Complex *, + _RF_LDS float _Complex *), + const float _Complex iv) {} +void __kmpc_xteamr_i_8x32(int v, int *r_ptr, int *tvals, uint32_t *td_ptr, + void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), + const int iv) {} +void __kmpc_xteamr_ui_8x32(uint32_t v, uint32_t *r_ptr, uint32_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint32_t *, uint32_t), + void (*_rf_lds)(_RF_LDS uint32_t *, + _RF_LDS uint32_t *), + const uint32_t iv) {} +void __kmpc_xteamr_l_8x32(long v, long *r_ptr, long *tvals, uint32_t *td_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), + const long iv) {} +void __kmpc_xteamr_ul_8x32(uint64_t v, uint64_t *r_ptr, 
uint64_t *tvals, + uint32_t *td_ptr, void (*_rf)(uint64_t *, uint64_t), + void (*_rf_lds)(_RF_LDS uint64_t *, + _RF_LDS uint64_t *), + const uint64_t iv) {} + +/// Builtin pair reduction functions. +/// These become function pointers for arg5 and arg6 of xteamr above. +/// Each pair reduction function must have two variants to support xteamr. +/// The 1st is for TLS memory and the 2nd is for LDS (scratchpad) memory. +/// Device versions live in Rfuns.cpp; the host stubs below are empty. User +/// defined reductions require that Clang codegen generate these functions. +void __kmpc_rfun_sum_d(double *val, double otherval) {} +void __kmpc_rfun_sum_lds_d(_RF_LDS double *val, _RF_LDS double *otherval) {} +void __kmpc_rfun_sum_f(float *val, float otherval) {} +void __kmpc_rfun_sum_lds_f(_RF_LDS float *val, _RF_LDS float *otherval) {} +void __kmpc_rfun_sum_cd(double _Complex *val, double _Complex otherval) {} +void __kmpc_rfun_sum_lds_cd(_RF_LDS double _Complex *val, + _RF_LDS double _Complex *otherval) {} +void __kmpc_rfun_sum_cf(float _Complex *val, float _Complex otherval) {} +void __kmpc_rfun_sum_lds_cf(_RF_LDS float _Complex *val, + _RF_LDS float _Complex *otherval) {} +void __kmpc_rfun_sum_i(int *val, int otherval) {} +void __kmpc_rfun_sum_lds_i(_RF_LDS int *val, _RF_LDS int *otherval) {} +void __kmpc_rfun_sum_ui(unsigned int *val, unsigned int otherval) {} +void __kmpc_rfun_sum_lds_ui(_RF_LDS unsigned int *val, + _RF_LDS unsigned int *otherval) {} +void __kmpc_rfun_sum_l(long *val, long otherval) {} +void __kmpc_rfun_sum_lds_l(_RF_LDS long *val, _RF_LDS long *otherval) {} +void __kmpc_rfun_sum_ul(unsigned long *val, unsigned long otherval) {} +void __kmpc_rfun_sum_lds_ul(_RF_LDS unsigned long *val, + _RF_LDS unsigned long *otherval) {} +/// Complex variables have no compare, so no min or max for cf and cd. 
+void __kmpc_rfun_min_d(double *val, double otherval) {} +void __kmpc_rfun_min_lds_d(_RF_LDS double *val, _RF_LDS double *otherval) {} +void __kmpc_rfun_min_f(float *val, float otherval) {} +void __kmpc_rfun_min_lds_f(_RF_LDS float *val, _RF_LDS float *otherval) {} +void __kmpc_rfun_min_i(int *val, int otherval) {} +void __kmpc_rfun_min_lds_i(_RF_LDS int *val, _RF_LDS int *otherval) {} +void __kmpc_rfun_min_ui(unsigned int *val, unsigned int otherval) {} +void __kmpc_rfun_min_lds_ui(_RF_LDS unsigned int *val, + _RF_LDS unsigned int *otherval) {} +void __kmpc_rfun_min_l(long *val, long otherval) {} +void __kmpc_rfun_min_lds_l(_RF_LDS long *val, _RF_LDS long *otherval) {} +void __kmpc_rfun_min_ul(unsigned long *val, unsigned long otherval) {} +void __kmpc_rfun_min_lds_ul(_RF_LDS unsigned long *val, + _RF_LDS unsigned long *otherval) {} +void __kmpc_rfun_max_d(double *val, double otherval) {} +void __kmpc_rfun_max_lds_d(_RF_LDS double *val, _RF_LDS double *otherval) {} +void __kmpc_rfun_max_f(float *val, float otherval) {} +void __kmpc_rfun_max_lds_f(_RF_LDS float *val, _RF_LDS float *otherval) {} +void __kmpc_rfun_max_i(int *val, int otherval) {} +void __kmpc_rfun_max_lds_i(_RF_LDS int *val, _RF_LDS int *otherval) {} +void __kmpc_rfun_max_ui(unsigned int *val, unsigned int otherval) {} +void __kmpc_rfun_max_lds_ui(_RF_LDS unsigned int *val, + _RF_LDS unsigned int *otherval) {} +void __kmpc_rfun_max_l(long *val, long otherval) {} +void __kmpc_rfun_max_lds_l(_RF_LDS long *val, _RF_LDS long *otherval) {} +void __kmpc_rfun_max_ul(unsigned long *val, unsigned long otherval) {} +void __kmpc_rfun_max_lds_ul(_RF_LDS unsigned long *val, + _RF_LDS unsigned long *otherval) {} +#undef _RF_LDS + +// host version of __kmpc_get_warp_size +int __kmpc_get_warp_size() { + printf(" executing on host \n"); + return -1; +} + +} // end of extern "C" + +#endif // end of host definitions