This is an archive of the discontinued LLVM Phabricator instance.

[OPENMP] Fast cross-team reduction (xteamr) helper functions.
Needs Review · Public

Authored by gregrodgers on Oct 24 2022, 2:59 PM.

Details

Summary

This review creates new DeviceRTL helper functions to support reductions in
OpenMP that are 50 to over 100 times faster than current OpenMP reductions.
The clang codegen that calls these functions is not yet complete.
However, this review contains an extensive test of all the functions that
simulates the reduction using OpenMP without the reduction clause.
The test also performs the equivalent reduction with current OpenMP to
demonstrate correctness and compare performance.

EXAMPLE: Consider this OpenMP reduction code, a classic dot product of two
double-precision vectors.

double sum = 0.0;
#pragma omp target teams distribute parallel for map(tofrom: sum) reduction(+:sum)
for (int64_t i = 0; i < array_size; i++)
  sum += a[i] * b[i];

A reduction is defined by two pair reduction functions and the reduction null
value (rnv). In the above example, the reduction is defined by a pair
reduction function that sums two double values, a pair reduction function
that sums two LDS doubles, and rnv = (double) 0. The pair reduction functions
for the built-in sum reduction are __kmpc_rfun_sum_d and __kmpc_rfun_sum_lds_d.
See the Xteamr.cpp file for the definitions of all xteamr functions.
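For illustration, a pair reduction function simply folds one value into
another. A minimal sketch of what the two sum pairs for doubles could look
like follows; the names and signatures here are assumptions for illustration
only, and the authoritative definitions are in Xteamr.cpp in this diff.

// Sketch only: assumed shapes of the two pair reduction functions for the
// built-in double sum. The "lds" variant operates on values that live in
// LDS (GPU shared memory) during the intra-team phase of the reduction.
static void rfun_sum_d(double *val, double otherval) {
  *val += otherval;       // fold a register value into the accumulator
}
static void rfun_sum_lds_d(double *val, double *otherval) {
  *val += *otherval;      // fold one LDS-resident value into another
}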

Currently, the xteamr helper functions support 8 data types and 6 thread
configurations: 3 thread configurations for warp size 32 and 3 for warp size 64.
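The helper name appears to encode the element type and the waves-by-lanes
thread configuration (for example, _d_16x32 for doubles with 16 waves of 32
lanes, i.e. 512 threads on a warp-size-32 device). As a hedged illustration,
a declaration consistent with the call used in the simulation below might
look as follows; the exact prototypes are in Xteamr.cpp, and the parameter
names here are assumptions.

#include <cstdint>

// Assumed shape of the double, 16-waves-by-32-lanes (512-thread) variant,
// inferred from the call site in the simulation below: thread-local value,
// pointer to the final result, per-team scratch array, atomic team counter,
// the two pair reduction functions, and the reduction null value.
extern "C" void __kmpc_xteamr_d_16x32(double val, double *result_ptr,
                                      double *team_vals,
                                      uint32_t *teams_done_ptr,
                                      void (*rf)(double *, double),
                                      void (*rf_lds)(double *, double *),
                                      double rnv);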

Clang will generate code equivalent to the following OpenMP simulation of
the reduction, in which the target offload pragma does NOT contain a
reduction clause.

#define _NUM_THREADS 512 // or 1024 or 256, number of waves must be power of 2
#define _NUM_TEAMS 80    // or get this value from ompx_get_device_num_units(devid)
int devid = 0; // default device or device from target construct
uint32_t zero = 0;
// T below is the reduction element type (double in this example).
struct loop_ctl_t {
  uint32_t *td_ptr;                 // Atomic counter accessed on device
  uint32_t reserved;                // reserved
  const int64_t stride = 1;         // stride to process input vectors
  const int64_t offset = 0;         // Offset to index of input vectors
  const int64_t size = _ARRAY_SIZE; // Size of input vectors
  const T rnv = T(0);               // reduction null value
  T *team_vals;                     // array of global team values
};
loop_ctl_t lc0; // Create and initialize a loop control structure.
lc0.team_vals = (T *)omp_target_alloc(sizeof(T) * _NUM_TEAMS, devid);
lc0.td_ptr    = (uint32_t *)omp_target_alloc(sizeof(uint32_t), devid);
omp_target_memcpy(lc0.td_ptr, &zero, sizeof(uint32_t), 0, 0, devid,
                    omp_get_initial_device());
#pragma omp target teams distribute parallel for num_teams(_NUM_TEAMS) \
   num_threads(_NUM_THREADS) map(tofrom:sum) map(to:lc0)
for (unsigned int k=0; k<(_NUM_TEAMS*_NUM_THREADS) ; k++) {
   T val0 = lc0.rnv;
   // This is the "BIGJUMP" loop code-gened by clang. A more complex form
   // is actually generated using lc0 when a non-zero offset and/or a
   // non-unit stride is required.
   for (int64_t i = k; i < _ARRAY_SIZE; i += (_NUM_TEAMS * _NUM_THREADS))
      val0 += a[i] * b[i]; // The body is the outlined reduction statement.
   // Each k iteration calls the helper function whose name is based on the
   // data type and wave configuration. The arguments include the function
   // pointers for the pair reduction functions.
   __kmpc_xteamr_d_16x32(val0, &sum, lc0.team_vals, lc0.td_ptr,
       __kmpc_rfun_sum_d, __kmpc_rfun_sum_lds_d, lc0.rnv);
}
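After the target region completes, the reduced value is in sum; the
device-side scratch buffers allocated for the simulation can then be
released. A minimal cleanup sketch, continuing the snippet above:

// Release the per-team scratch array and the atomic team counter that were
// allocated with omp_target_alloc before the target region.
omp_target_free(lc0.team_vals, devid);
omp_target_free(lc0.td_ptr, devid);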

In openmp/libomptarget/test/xteamr there is a comprehensive test of the
xteamr helper functions defined in this review.

The test_xteamr.cpp code in this review generates the following output on
an NVIDIA Volta with 80 teams and 512 threads. This shows a significant
performance improvement over current OpenMP reductions. For example, finding
the minimum float value in a vector of floats was over 140 times faster than
current OpenMP, and summing doubles was over 50 times faster.

TEST DOUBLE 512 THREADS
Running kernels 12 times
Ignoring timing of first 2  runs
Precision: double
Warp size:32
Array elements: 41943040
Array size:     320 MB
Function    Best-MB/sec  Min (sec)     Max      Average     Avg-MB/sec
  ompdot          15517   0.043249  0.043364   0.043276       15507
  simdot         847885   0.000791  0.000820   0.000803      835647
  ompmax           7756   0.043260  0.043370   0.043289        7751
  simmax         775913   0.000432  0.000451   0.000438      765812
  ompmin           7755   0.043266  0.043312   0.043285        7752
  simmin         773780   0.000434  0.000447   0.000439      765161

TEST FLOAT 512 THREADS
Running kernels 12 times
Ignoring timing of first 2  runs
Precision: float
Warp size:32
Array elements: 41943040
Array size:     160 MB
Function    Best-MB/sec  Min (sec)     Max      Average     Avg-MB/sec
  ompdot           7816   0.042930  0.043042   0.042965        7810
  simdot         744750   0.000451  0.000468   0.000459      730648
  ompmax           3901   0.043010  0.043147   0.043042        3898
  simmax         557542   0.000301  0.000309   0.000307      546905
  ompmin           3898   0.043041  0.043245   0.043094        3893
  simmin         562826   0.000298  0.000337   0.000307      546866

TEST INT 512 THREADS
Running kernels 12 times
Ignoring timing of first 2  runs
Integer Size: 4
Warp size:32
Array elements: 41943040
Array size:     160 MB
Function    Best-MB/sec  Min (sec)     Max      Average     Avg-MB/sec
  ompdot           7811   0.042957  0.042976   0.042966        7810
  simdot         755215   0.000444  0.000460   0.000451      743335
  ompmax           3900   0.043020  0.043047   0.043032        3899
  simmax         561618   0.000299  0.000309   0.000304      551575
  ompmin           3897   0.043057  0.043101   0.043071        3895
  simmin         552518   0.000304  0.000314   0.000309      543749

TEST UNSIGNED INT 512 THREADS
Running kernels 12 times
Ignoring timing of first 2  runs
Integer Size: 4
Warp size:32
Array elements: 41943040
Array size:     160 MB
Function    Best-MB/sec  Min (sec)     Max      Average     Avg-MB/sec
  ompdot           7815   0.042936  0.042965   0.042949        7813
  simdot         750056   0.000447  0.000460   0.000453      741305
  ompmax           3900   0.043023  0.043048   0.043032        3899
  simmax         559142   0.000300  0.000309   0.000305      550425
  ompmin           3897   0.043049  0.043074   0.043064        3896
  simmin         555710   0.000302  0.000311   0.000307      547178

TEST LONG 512 THREADS
Running kernels 12 times
Ignoring timing of first 2  runs
Integer Size: 8
Warp size:32
Array elements: 41943040
Array size:     320 MB
Function    Best-MB/sec  Min (sec)     Max      Average     Avg-MB/sec
  ompdot          15501   0.043294  0.043331   0.043309       15495
  simdot         811168   0.000827  0.000850   0.000838      800531
  ompmax           7763   0.043222  0.043256   0.043241        7760
  simmax         721225   0.000465  0.000482   0.000472      710672
  ompmin           7764   0.043217  0.043251   0.043234        7761
  simmin         728138   0.000461  0.000471   0.000467      718363

TEST UNSIGNED LONG 512 THREADS
Running kernels 12 times
Ignoring timing of first 2  runs
Integer Size: 8
Warp size:32
Array elements: 41943040
Array size:     320 MB
Function    Best-MB/sec  Min (sec)     Max      Average     Avg-MB/sec
  ompdot          15485   0.043339  0.043378   0.043359       15478
  simdot         817903   0.000820  0.000835   0.000829      809555
  ompmax           8004   0.041921  0.042024   0.041938        8001
  simmax         722079   0.000465  0.000479   0.000473      709582
  ompmin           7754   0.043272  0.043298   0.043285        7752
  simmin         721427   0.000465  0.000483   0.000474      707325

TEST DOUBLE COMPLEX 512 THREADS
Running kernels 12 times
Ignoring timing of first 2  runs
Precision: double _Complex
Warp size:32
Array elements: 41943040
Array size:     640 MB
Function    Best-MB/sec  Min (sec)     Max      Average     Avg-MB/sec
  ompdot          30727   0.043680  0.043714   0.043700       30714
  simdot         861368   0.001558  0.001579   0.001566      857175

TEST FLOAT COMPLEX 512 THREADS
Running kernels 12 times
Ignoring timing of first 2  runs
Precision: float _Complex
Warp size:32
Array elements: 41943040
Array size:     320 MB
Function    Best-MB/sec  Min (sec)     Max      Average     Avg-MB/sec
  ompdot          15453   0.043428  0.043452   0.043440       15449
  simdot         797626   0.000841  0.000861   0.000849      790188

Diff Detail

Event Timeline

gregrodgers created this revision. Oct 24 2022, 2:59 PM
Herald added a project: Restricted Project. Oct 24 2022, 2:59 PM
gregrodgers requested review of this revision. Oct 24 2022, 2:59 PM
Herald added a project: Restricted Project.
gregrodgers edited the summary of this revision. Oct 24 2022, 3:05 PM
gregrodgers edited the summary of this revision. Oct 24 2022, 3:12 PM
  • fix for make check-openmp
  • fix lit test of xteamr/test_xteamr.cpp
  • improve performance and lower register utilization by deriving reduction constants from k and passing numteams from codegen to the xteamr function; this changes the interface to the xteamr functions and removes dependencies on DeviceRTL mapping functions
  • removed duplicate variants of shfl that were identical between amdgcn and nvptx, and simplified the nvptx variant of shfl_xor_int

4th update to original D136631 review. Changes:

  • Removed an unnecessary thread sync following the LDS reduction.
  • Replaced the LDS bool __is_last_team with an LDS copy of the team counter value returned from the atomic increment. This removes a few more scalar registers. Performance and resource utilization noticeably improved across all updates to this review.
  • Changed the parameter name inival to "rnv" (Reduction Null Value) for consistency.
  • Improved the header docs and the comment block for the main function _xteamr_reduction.