diff --git a/openmp/libomptarget/deviceRTLs/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/CMakeLists.txt --- a/openmp/libomptarget/deviceRTLs/CMakeLists.txt +++ b/openmp/libomptarget/deviceRTLs/CMakeLists.txt @@ -10,4 +10,5 @@ # ##===----------------------------------------------------------------------===## +add_subdirectory(common) add_subdirectory(nvptx) diff --git a/openmp/libomptarget/deviceRTLs/common/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/common/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/deviceRTLs/common/CMakeLists.txt @@ -0,0 +1,208 @@ +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build the Common Device RTL parts. +# +##===----------------------------------------------------------------------===## + +set(LIBOMPTARGET_DEVICE_RTL_COMPILER "" CACHE STRING + "Path to alternate compiler to be used for the device RTL.") +set(LIBOMPTARGET_DEVICE_RTL_LINKER "" CACHE STRING + "Path to alternate compiler to be used for the device RTL.") + +if(LIBOMPTARGET_DEVICE_RTL_COMPILER) + find_program(ALTERNATE_DEVICE_RTL_COMPILER NAMES ${LIBOMPTARGET_DEVICE_RTL_COMPILER}) + if(NOT ALTERNATE_DEVICE_RTL_COMPILER) + libomptarget_say("Not building offloading device RTL: + LIBOMPTARGET_DEVICE_RTL_COMPILER not valid, was '${LIBOMPTARGET_DEVICE_RTL_LINKER}'") + endif() + set(LIBOMPTARGET_DEVICE_RTL_COMPILER ${ALTERNATE_DEVICE_RTL_COMPILER} CACHE FILEPATH "" FORCE) +else() + set(LIBOMPTARGET_DEVICE_RTL_COMPILER ${CMAKE_CXX_COMPILER} CACHE FILEPATH "" FORCE) +endif() + +if(NOT LIBOMPTARGET_DEVICE_RTL_LINKER) + get_filename_component(compiler_base_directory ${LIBOMPTARGET_DEVICE_RTL_COMPILER} DIRECTORY) + set(LIBOMPTARGET_DEVICE_RTL_LINKER "${compiler_base_directory}/llvm-link" CACHE FILEPATH "" FORCE) +endif() + +if(LIBOMPTARGET_DEVICE_RTL_LINKER) + find_program(ALTERNATE_DEVICE_RTL_LINKER NAMES ${LIBOMPTARGET_DEVICE_RTL_LINKER}) + if(NOT ALTERNATE_DEVICE_RTL_LINKER) + libomptarget_say("Not building offloading device RTL: + LIBOMPTARGET_DEVICE_RTL_LINKER not valid, was '${LIBOMPTARGET_DEVICE_RTL_LINKER}'") + endif() + set(LIBOMPTARGET_DEVICE_RTL_LINKER ${ALTERNATE_DEVICE_RTL_LINKER} CACHE FILEPATH "" FORCE) +endif() + +if(LIBOMPTARGET_DEVICE_RTL_COMPILER AND LIBOMPTARGET_DEVICE_RTL_LINKER) + + get_filename_component(devicertl_base_directory + ${CMAKE_CURRENT_SOURCE_DIR} + DIRECTORY) + + set(src_files + src/cancel.cu + #src/loop.cu + ) + + + libomptarget_say("Building LLVM bitcode offloading device RTL.") + + # Set flags for LLVM Bitcode compilation. + set(bc_flags -S + -x c++ + -fopenmp + -Xclang -fopenmp-is-device + -Xclang -emit-llvm-bc + -I${devicertl_base_directory} + -I${CMAKE_CURRENT_SOURCE_DIR} + -I${CMAKE_CURRENT_SOURCE_DIR}/include + ) + + # Activate RTL message dumps if requested by the user. + set(LIBOMPTARGET_DEBUG FALSE CACHE BOOL + "Activate NVPTX device RTL debug messages.") + + if(${LIBOMPTARGET_DEBUG}) + set(bc_flags ${bc_flags} -DOMPTARGET_DEBUG=-1) + else() + set(bc_flags ${bc_flags} -DOMPTARGET_DEBUG=0) + endif() + + # Create target to build all Bitcode libraries. + add_custom_target(omptarget-device-rtl) + + set(names "") + set(triples "") + set(target_cpus "") + set(target_features "") + set(aux_triples "") + + # NVIDIA configurations + + # Generate a Bitcode library for all the compute capabilities the user requested. + set(nvptx_ptx_list 50 60 61 62 63 64 65 70 71) + set(nvptx_triple_list "nvptx" "nvptx64") + #execute_process(COMMAND ${LIBOMPTARGET_DEVICE_RTL_COMPILER} -v | grep 'target' OUTPUT_VARIABLE aux_triple) + set(nvptx_aux_triples "x86_64-unknown-linux-gnu") + + # Build library support for the highest compute capability the system supports + # and always build support for sm_35 by default + if (${LIBOMPTARGET_DEP_CUDA_ARCH} EQUAL 35) + set(default_capabilities 35) + else() + set(default_capabilities "35,${LIBOMPTARGET_DEP_CUDA_ARCH}") + endif() + + if (DEFINED LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY) + set(default_capabilities ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY}) + libomptarget_warning_say("LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY is deprecated, please use LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES") + endif() + set(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES ${default_capabilities} CACHE STRING + "List of CUDA Compute Capabilities to be used to compile the NVPTX device RTL.") + string(REPLACE "," ";" nvptx_sm_list ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES}) + + + foreach(ptx ${nvptx_ptx_list}) + foreach(triple ${nvptx_triple_list}) + foreach(sm ${nvptx_sm_list}) + foreach(aux_triple ${nvptx_aux_triples}) + list(APPEND names "${triple}-sm${sm}-ptx${ptx}") + list(APPEND triples ${triple}) + list(APPEND target_cpus sm_${sm}) + list(APPEND target_features +ptx${ptx}) + list(APPEND aux_triples ${aux_triple}) + endforeach() + endforeach() + endforeach() + endforeach() + + # - NVIDIA + + # Host + + set(host_triple "x86_64-unknown-linux-gnu") + list(APPEND names "${host_triple}") + list(APPEND triples ${host_triple}) + list(APPEND target_cpus "") + list(APPEND target_features "") + list(APPEND aux_triples "") + + # - Host + + + foreach(cname triple target_cpu target_feature aux_triple IN ZIP_LISTS names triples target_cpus target_features aux_triples) + # Compile CUDA files to bitcode. + libomptarget_say("${cname} --- ${triple} --- ${target_cpu}") + set(bc_files "") + foreach(src ${src_files}) + get_filename_component(infile ${src} ABSOLUTE) + get_filename_component(outfile ${src} NAME) + + set(config_flags "") + if(${triple}) + list(APPEND config_flags -target=${triple}) + endif() + if(${target_cpu}) + list(APPEND config_flags -mcpu=${target_cpu}) + endif() + if(${target_feature}) + list(APPEND config_flags -Xclang) + list(APPEND config_flags -target-feature) + list(APPEND config_flags -Xclang) + list(APPEND config_flags ${target_feature}) + endif() + if(${aux_triple}) + list(APPEND config_flags -Xclang) + list(APPEND config_flags -aux-triple) + list(APPEND config_flags -Xclang) + list(APPEND config_flags ${aux_triple}) + endif() + + add_custom_command(OUTPUT ${outfile}-${cname}.bc + COMMAND ${LIBOMPTARGET_DEVICE_RTL_COMPILER} ${bc_flags} + ${config_flags} + ${} ${infile} -o ${outfile}-${cname}.bc + DEPENDS ${infile} + IMPLICIT_DEPENDS CXX ${infile} + COMMENT "Building LLVM bitcode ${outfile}-${cname}.bc" + VERBATIM + ) + set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES + ${outfile}-${cname}.bc) + + list(APPEND bc_files ${outfile}-${cname}.bc) + endforeach() + + # Link to a bitcode library. + add_custom_command(OUTPUT + ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-device-rtl-${cname}.bc + COMMAND ${LIBOMPTARGET_DEVICE_RTL_LINKER} + -o ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-device-rtl-${cname}.bc ${bc_files} + DEPENDS ${bc_files} + COMMENT "Linking LLVM bitcode libomptarget-device-rtl-${cname}.bc" + ) + set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES + libomptarget-device-rtl-${cname}.bc) + + add_custom_target(omptarget-device-rtl-${cname}-bc ALL DEPENDS + ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-device-rtl-${cname}.bc) + add_dependencies(omptarget-device-rtl omptarget-device-rtl-${cname}-bc) + + # Copy library to destination. + add_custom_command(TARGET omptarget-device-rtl-${cname}-bc POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-device-rtl-${cname}.bc + ${OPENMP_INSTALL_LIBDIR}) + + # Install bitcode library under the lib destination folder. + install(FILES + ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-device-rtl-${cname}.bc DESTINATION "${OPENMP_INSTALL_LIBDIR}") + endforeach() +endif() diff --git a/openmp/libomptarget/deviceRTLs/common/debug.h b/openmp/libomptarget/deviceRTLs/common/debug.h --- a/openmp/libomptarget/deviceRTLs/common/debug.h +++ b/openmp/libomptarget/deviceRTLs/common/debug.h @@ -126,7 +126,7 @@ // implementation for debug //////////////////////////////////////////////////////////////////////////////// -#if OMPTARGET_NVPTX_DEBUG || OMPTARGET_NVPTX_TEST || OMPTARGET_NVPTX_WARNING +#if OMPTARGET_NVPTX_DEBUG #include "common/support.h" template @@ -135,9 +135,6 @@ (int)GetWarpId(), (int)GetLaneId(), parameters...); } -#endif -#if OMPTARGET_NVPTX_TEST - template NOINLINE static void check(bool cond, const char *fmt, Arguments... parameters) { @@ -211,6 +208,7 @@ #define LT_INPUT (LT_SET_INPUT | LT_SET_FUSSY) #define LT_FUSSY (LT_SET_FUSSY) +#if OMPTARGET_NVPTX_DEBUG #if OMPTARGET_NVPTX_TEST == LT_SET_SAFETY #define TON(_flag) ((OMPTARGET_NVPTX_TEST) & (_flag)) @@ -244,6 +242,7 @@ } \ } +#endif #else #define TON(_flag) (0) @@ -260,7 +259,7 @@ #define LW_INPUT (LW_SET_FUSSY | LW_SET_INPUT) #define LW_FUSSY (LW_SET_FUSSY) -#if OMPTARGET_NVPTX_WARNING +#if OMPTARGET_NVPTX_DEBUG #define WON(_flag) ((OMPTARGET_NVPTX_WARNING) & (_flag)) #define WARNING0(_flag, _str) \ diff --git a/openmp/libomptarget/deviceRTLs/common/device_environment.h b/openmp/libomptarget/deviceRTLs/common/device_environment.h --- a/openmp/libomptarget/deviceRTLs/common/device_environment.h +++ b/openmp/libomptarget/deviceRTLs/common/device_environment.h @@ -13,12 +13,12 @@ #ifndef _OMPTARGET_DEVICE_ENVIRONMENT_H_ #define _OMPTARGET_DEVICE_ENVIRONMENT_H_ -#include "target_impl.h" +#include struct omptarget_device_environmentTy { int32_t debug_level; }; -extern DEVICE omptarget_device_environmentTy omptarget_device_environment; +extern omptarget_device_environmentTy omptarget_device_environment; #endif diff --git a/openmp/libomptarget/deviceRTLs/common/include/target_impl.h b/openmp/libomptarget/deviceRTLs/common/include/target_impl.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/deviceRTLs/common/include/target_impl.h @@ -0,0 +1,94 @@ +//===------------ target_impl.h - NVPTX OpenMP GPU options ------- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Definitions of target specific functions +// +//===----------------------------------------------------------------------===// +#ifndef _TARGET_IMPL_H_ +#define _TARGET_IMPL_H_ + +#include +#include +#include +#include + +#define DEVICE __device__ +#define INLINE __forceinline__ DEVICE +#define NOINLINE __noinline__ DEVICE +#define SHARED __shared__ + +//////////////////////////////////////////////////////////////////////////////// +// Kernel options +//////////////////////////////////////////////////////////////////////////////// + +void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi); + +uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi); + +// enum : __kmpc_impl_lanemask_t; +//__kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0 +//}; + +//__kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt(); + +//__kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt(); + +uint32_t __kmpc_impl_smid(); + +double __kmpc_impl_get_wtick(); + +double __kmpc_impl_get_wtime(); + +uint32_t __kmpc_impl_ffs(uint32_t x); + +uint32_t __kmpc_impl_popc(uint32_t x); + +// template T __kmpc_impl_min(T x, T y); +// return min(x, y); +//} + +//__kmpc_impl_lanemask_t __kmpc_impl_activemask() ; + +// int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var, +// int32_t SrcLane); + +// int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask, +// int32_t Var, uint32_t Delta, +// int32_t Width); + +void __kmpc_impl_syncthreads(); + +// void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask); + +void __kmpc_impl_target_init(); +void __kmpc_impl_named_sync(uint32_t num_threads); +void __kmpc_impl_threadfence(void); +void __kmpc_impl_threadfence_block(void); +void __kmpc_impl_threadfence_system(void); + +// Calls to the NVPTX layer (assuming 1D layout) +int GetThreadIdInBlock(); +int GetBlockIdInKernel(); +int GetNumberOfBlocksInKernel(); +int GetNumberOfThreadsInBlock(); +unsigned GetWarpId(); +unsigned GetLaneId(); + +// Locks +struct omp_lock_t; +void __kmpc_impl_init_lock(omp_lock_t *lock); +void __kmpc_impl_destroy_lock(omp_lock_t *lock); +void __kmpc_impl_set_lock(omp_lock_t *lock); +void __kmpc_impl_unset_lock(omp_lock_t *lock); +int __kmpc_impl_test_lock(omp_lock_t *lock); + +// Memory +void *__kmpc_impl_malloc(size_t x); +void __kmpc_impl_free(void *x); + +#endif diff --git a/openmp/libomptarget/deviceRTLs/common/omptarget.h b/openmp/libomptarget/deviceRTLs/common/omptarget.h --- a/openmp/libomptarget/deviceRTLs/common/omptarget.h +++ b/openmp/libomptarget/deviceRTLs/common/omptarget.h @@ -14,12 +14,15 @@ #ifndef OMPTARGET_H #define OMPTARGET_H -#include "target_impl.h" +//#include "target_impl.h" #include "common/debug.h" // debug #include "interface.h" // interfaces with omp, compiler, and user #include "common/state-queue.h" #include "common/support.h" +#include +#include + #define OMPTARGET_NVPTX_VERSION 1.1 // used by the library for the interface with the app @@ -34,15 +37,17 @@ #define BARRIER_COUNTER 0 #define ORDERED_COUNTER 1 +#define MAX_SHARED_ARGS 20 + // arguments needed for L0 parallelism only. class omptarget_nvptx_SharedArgs { public: // All these methods must be called by the master thread only. - INLINE void Init() { + void Init() { args = buffer; nArgs = MAX_SHARED_ARGS; } - INLINE void DeInit() { + void DeInit() { // Free any memory allocated for outlined parallel function with a large // number of arguments. if (nArgs > MAX_SHARED_ARGS) { @@ -50,7 +55,7 @@ Init(); } } - INLINE void EnsureSize(size_t size) { + void EnsureSize(size_t size) { if (size > nArgs) { if (nArgs > MAX_SHARED_ARGS) { SafeFree(args, "new extended args"); @@ -60,7 +65,8 @@ } } // Called by all threads. - INLINE void **GetArgs() const { return args; }; + void **GetArgs() const { return args; }; + private: // buffer of pre-allocated arguments. void *buffer[MAX_SHARED_ARGS]; @@ -71,8 +77,8 @@ uint32_t nArgs; }; -extern DEVICE SHARED omptarget_nvptx_SharedArgs - omptarget_nvptx_globalArgs; +// SHARED +extern omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs; // Worker slot type which is initialized with the default worker slot // size of 4*32 bytes. @@ -94,7 +100,8 @@ __kmpc_impl_lanemask_t ActiveThreads[DS_Max_Warp_Number]; }; -extern DEVICE SHARED DataSharingStateTy DataSharingState; +// SHARED +extern DataSharingStateTy DataSharingState; //////////////////////////////////////////////////////////////////////////////// // task ICV and (implicit & explicit) task state @@ -102,36 +109,36 @@ class omptarget_nvptx_TaskDescr { public: // methods for flags - INLINE omp_sched_t GetRuntimeSched() const; - INLINE void SetRuntimeSched(omp_sched_t sched); - INLINE int InParallelRegion() const { return items.flags & TaskDescr_InPar; } - INLINE int InL2OrHigherParallelRegion() const { + omp_sched_t GetRuntimeSched() const; + void SetRuntimeSched(omp_sched_t sched); + int InParallelRegion() const { return items.flags & TaskDescr_InPar; } + int InL2OrHigherParallelRegion() const { return items.flags & TaskDescr_InParL2P; } - INLINE int IsParallelConstruct() const { + int IsParallelConstruct() const { return items.flags & TaskDescr_IsParConstr; } - INLINE int IsTaskConstruct() const { return !IsParallelConstruct(); } + int IsTaskConstruct() const { return !IsParallelConstruct(); } // methods for other fields - INLINE uint16_t &ThreadId() { return items.threadId; } - INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; } - INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() const { return prev; } - INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) { + uint16_t &ThreadId() { return items.threadId; } + uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; } + omptarget_nvptx_TaskDescr *GetPrevTaskDescr() const { return prev; } + void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) { prev = taskDescr; } // init & copy - INLINE void InitLevelZeroTaskDescr(); - INLINE void InitLevelOneTaskDescr(omptarget_nvptx_TaskDescr *parentTaskDescr); - INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr); - INLINE void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr); - INLINE void CopyParent(omptarget_nvptx_TaskDescr *parentTaskDescr); - INLINE void CopyForExplicitTask(omptarget_nvptx_TaskDescr *parentTaskDescr); - INLINE void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr); - INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr); - INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr, - uint16_t tid, uint16_t tnum); - INLINE void SaveLoopData(); - INLINE void RestoreLoopData() const; + void InitLevelZeroTaskDescr(); + void InitLevelOneTaskDescr(omptarget_nvptx_TaskDescr *parentTaskDescr); + void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr); + void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr); + void CopyParent(omptarget_nvptx_TaskDescr *parentTaskDescr); + void CopyForExplicitTask(omptarget_nvptx_TaskDescr *parentTaskDescr); + void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr); + void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr); + void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr, + uint16_t tid, uint16_t tnum); + void SaveLoopData(); + void RestoreLoopData() const; private: // bits for flags: (6 used, 2 free) @@ -176,7 +183,7 @@ public: // access to data - INLINE omptarget_nvptx_TaskDescr *WorkTaskDescr() { return &masterTaskICV; } + omptarget_nvptx_TaskDescr *WorkTaskDescr() { return &masterTaskICV; } private: omptarget_nvptx_TaskDescr masterTaskICV; @@ -187,17 +194,15 @@ class omptarget_nvptx_TeamDescr { public: // access to data - INLINE omptarget_nvptx_TaskDescr *LevelZeroTaskDescr() { + omptarget_nvptx_TaskDescr *LevelZeroTaskDescr() { return &levelZeroTaskDescr; } - INLINE omptarget_nvptx_WorkDescr &WorkDescr() { - return workDescrForActiveParallel; - } + omptarget_nvptx_WorkDescr &WorkDescr() { return workDescrForActiveParallel; } // init - INLINE void InitTeamDescr(); + void InitTeamDescr(); - INLINE __kmpc_data_sharing_slot *GetPreallocatedSlotAddr(int wid) { + __kmpc_data_sharing_slot *GetPreallocatedSlotAddr(int wid) { worker_rootS[wid].DataEnd = &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size; // We currently do not have a next slot. @@ -224,29 +229,26 @@ class omptarget_nvptx_ThreadPrivateContext { public: // task - INLINE omptarget_nvptx_TaskDescr *Level1TaskDescr(int tid) { + omptarget_nvptx_TaskDescr *Level1TaskDescr(int tid) { return &levelOneTaskDescr[tid]; } - INLINE void SetTopLevelTaskDescr(int tid, - omptarget_nvptx_TaskDescr *taskICV) { + void SetTopLevelTaskDescr(int tid, omptarget_nvptx_TaskDescr *taskICV) { topTaskDescr[tid] = taskICV; } - INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid) const; + omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid) const; // parallel - INLINE uint16_t &NumThreadsForNextParallel(int tid) { - return nextRegion.tnum[tid]; - } + uint16_t &NumThreadsForNextParallel(int tid) { return nextRegion.tnum[tid]; } // schedule (for dispatch) - INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; } - INLINE int64_t &Chunk(int tid) { return chunk[tid]; } - INLINE int64_t &LoopUpperBound(int tid) { return loopUpperBound[tid]; } - INLINE int64_t &NextLowerBound(int tid) { return nextLowerBound[tid]; } - INLINE int64_t &Stride(int tid) { return stride[tid]; } + kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; } + int64_t &Chunk(int tid) { return chunk[tid]; } + int64_t &LoopUpperBound(int tid) { return loopUpperBound[tid]; } + int64_t &NextLowerBound(int tid) { return nextLowerBound[tid]; } + int64_t &Stride(int tid) { return stride[tid]; } - INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; } + omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; } - INLINE void InitThreadPrivateContext(int tid); - INLINE uint64_t &Cnt() { return cnt; } + void InitThreadPrivateContext(int tid); + uint64_t &Cnt() { return cnt; } private: // team context for this team @@ -277,13 +279,11 @@ volatile unsigned keys[OMP_STATE_COUNT]; } MemData[MAX_SM]; - INLINE static uint32_t hash(unsigned key) { - return key & (OMP_STATE_COUNT - 1); - } + static uint32_t hash(unsigned key) { return key & (OMP_STATE_COUNT - 1); } public: - INLINE void Release(); - INLINE const void *Acquire(const void *buf, size_t size); + void Release(); + const void *Acquire(const void *buf, size_t size); }; //////////////////////////////////////////////////////////////////////////////// @@ -292,20 +292,18 @@ // global data tables //////////////////////////////////////////////////////////////////////////////// -extern DEVICE omptarget_nvptx_SimpleMemoryManager - omptarget_nvptx_simpleMemoryManager; -extern DEVICE SHARED uint32_t usedMemIdx; -extern DEVICE SHARED uint32_t usedSlotIdx; -extern DEVICE SHARED uint8_t - parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; -extern DEVICE SHARED uint16_t threadLimit; -extern DEVICE SHARED uint16_t threadsInTeam; -extern DEVICE SHARED uint16_t nThreads; -extern DEVICE SHARED - omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; - -extern DEVICE SHARED uint32_t execution_param; -extern DEVICE SHARED void *ReductionScratchpadPtr; +extern omptarget_nvptx_SimpleMemoryManager omptarget_nvptx_simpleMemoryManager; +extern SHARED uint32_t usedMemIdx; +extern SHARED uint32_t usedSlotIdx; +extern SHARED uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; +extern SHARED uint16_t threadLimit; +extern SHARED uint16_t threadsInTeam; +extern SHARED uint16_t nThreads; +extern SHARED omptarget_nvptx_ThreadPrivateContext + *omptarget_nvptx_threadPrivateContext; + +extern SHARED uint32_t execution_param; +extern SHARED void *ReductionScratchpadPtr; //////////////////////////////////////////////////////////////////////////////// // work function (outlined parallel/simd functions) and arguments. @@ -313,18 +311,16 @@ //////////////////////////////////////////////////////////////////////////////// typedef void *omptarget_nvptx_WorkFn; -extern volatile DEVICE SHARED omptarget_nvptx_WorkFn - omptarget_nvptx_workFn; +extern volatile SHARED omptarget_nvptx_WorkFn omptarget_nvptx_workFn; //////////////////////////////////////////////////////////////////////////////// // get private data structures //////////////////////////////////////////////////////////////////////////////// -INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor(); -INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor(); -INLINE omptarget_nvptx_TaskDescr * -getMyTopTaskDescriptor(bool isSPMDExecutionMode); -INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId); +omptarget_nvptx_TeamDescr &getMyTeamDescriptor(); +omptarget_nvptx_WorkDescr &getMyWorkDescriptor(); +omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(bool isSPMDExecutionMode); +omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId); //////////////////////////////////////////////////////////////////////////////// // inlined implementation diff --git a/openmp/libomptarget/deviceRTLs/common/src/cancel.cu b/openmp/libomptarget/deviceRTLs/common/src/cancel.cu --- a/openmp/libomptarget/deviceRTLs/common/src/cancel.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/cancel.cu @@ -10,19 +10,26 @@ // //===----------------------------------------------------------------------===// -#include "interface.h" #include "common/debug.h" -EXTERN int32_t __kmpc_cancellationpoint(kmp_Ident *loc, int32_t global_tid, +#include + +struct kmp_Ident; + +#pragma omp declare target + +int32_t __kmpc_cancellationpoint(kmp_Ident *loc, int32_t global_tid, int32_t cancelVal) { PRINT(LD_IO, "call kmpc_cancellationpoint(cancel val %d)\n", (int)cancelVal); // disabled return 0; } -EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid, +int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid, int32_t cancelVal) { PRINT(LD_IO, "call kmpc_cancel(cancel val %d)\n", (int)cancelVal); // disabled return 0; -} +} + +#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/loop.cu b/openmp/libomptarget/deviceRTLs/common/src/loop.cu --- a/openmp/libomptarget/deviceRTLs/common/src/loop.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/loop.cu @@ -13,8 +13,12 @@ //===----------------------------------------------------------------------===// #include "common/omptarget.h" -#include "target_impl.h" -#include "common/target_atomic.h" +//#include "target_impl.h" +//#include "common/target_atomic.h" +#include "interface.h" +#include "debug.h" +#include "support.h" +#include //////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// @@ -47,7 +51,7 @@ */ // helper function for static chunk - INLINE static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride, + static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride, ST chunk, T entityId, T numberOfEntities) { // each thread executes multiple chunks all of the same size, except // the last one @@ -68,7 +72,7 @@ // Loop with static scheduling without chunk // helper function for static no chunk - INLINE static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride, + static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride, ST &chunk, T entityId, T numberOfEntities) { // No chunk size specified. Each thread or warp gets at most one @@ -94,7 +98,7 @@ //////////////////////////////////////////////////////////////////////////////// // Support for Static Init - INLINE static void for_static_init(int32_t gtid, int32_t schedtype, + static void for_static_init(int32_t gtid, int32_t schedtype, int32_t *plastiter, T *plower, T *pupper, ST *pstride, ST chunk, bool IsSPMDExecutionMode) { @@ -195,12 +199,12 @@ //////////////////////////////////////////////////////////////////////////////// // Support for dispatch Init - INLINE static int OrderedSchedule(kmp_sched_t schedule) { + static int OrderedSchedule(kmp_sched_t schedule) { return schedule >= kmp_sched_ordered_first && schedule <= kmp_sched_ordered_last; } - INLINE static void dispatch_init(kmp_Ident *loc, int32_t threadId, + static void dispatch_init(kmp_Ident *loc, int32_t threadId, kmp_sched_t schedule, T lb, T ub, ST st, ST chunk) { if (checkRuntimeUninitialized(loc)) { @@ -381,7 +385,7 @@ //////////////////////////////////////////////////////////////////////////////// // Support for dispatch next - INLINE static uint64_t Shuffle(__kmpc_impl_lanemask_t active, int64_t val, + static uint64_t Shuffle(__kmpc_impl_lanemask_t active, int64_t val, int leader) { uint32_t lo, hi; __kmpc_impl_unpack(val, lo, hi); @@ -390,7 +394,7 @@ return __kmpc_impl_pack(lo, hi); } - INLINE static uint64_t NextIter() { + static uint64_t NextIter() { __kmpc_impl_lanemask_t active = __kmpc_impl_activemask(); uint32_t leader = __kmpc_impl_ffs(active) - 1; uint32_t change = __kmpc_impl_popc(active); @@ -406,7 +410,7 @@ return warp_res + rank; } - INLINE static int DynamicNextChunk(T &lb, T &ub, T chunkSize, + static int DynamicNextChunk(T &lb, T &ub, T chunkSize, T loopLowerBound, T loopUpperBound) { T N = NextIter(); lb = loopLowerBound + N * chunkSize; @@ -438,7 +442,7 @@ return FINISHED; } - INLINE static int dispatch_next(kmp_Ident *loc, int32_t gtid, int32_t *plast, + static int dispatch_next(kmp_Ident *loc, int32_t gtid, int32_t *plast, T *plower, T *pupper, ST *pstride) { if (checkRuntimeUninitialized(loc)) { // In SPMD mode no need to check parallelism level - dynamic scheduling @@ -513,7 +517,7 @@ return DISPATCH_NOTFINISHED; } - INLINE static void dispatch_fini() { + static void dispatch_fini() { // nothing } @@ -527,7 +531,7 @@ //////////////////////////////////////////////////////////////////////////////// // init -EXTERN void __kmpc_dispatch_init_4(kmp_Ident *loc, int32_t tid, +void __kmpc_dispatch_init_4(kmp_Ident *loc, int32_t tid, int32_t schedule, int32_t lb, int32_t ub, int32_t st, int32_t chunk) { PRINT0(LD_IO, "call kmpc_dispatch_init_4\n"); @@ -535,7 +539,7 @@ loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); } -EXTERN void __kmpc_dispatch_init_4u(kmp_Ident *loc, int32_t tid, +void __kmpc_dispatch_init_4u(kmp_Ident *loc, int32_t tid, int32_t schedule, uint32_t lb, uint32_t ub, int32_t st, int32_t chunk) { PRINT0(LD_IO, "call kmpc_dispatch_init_4u\n"); @@ -543,7 +547,7 @@ loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); } -EXTERN void __kmpc_dispatch_init_8(kmp_Ident *loc, int32_t tid, +void __kmpc_dispatch_init_8(kmp_Ident *loc, int32_t tid, int32_t schedule, int64_t lb, int64_t ub, int64_t st, int64_t chunk) { PRINT0(LD_IO, "call kmpc_dispatch_init_8\n"); @@ -551,7 +555,7 @@ loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); } -EXTERN void __kmpc_dispatch_init_8u(kmp_Ident *loc, int32_t tid, +void __kmpc_dispatch_init_8u(kmp_Ident *loc, int32_t tid, int32_t schedule, uint64_t lb, uint64_t ub, int64_t st, int64_t chunk) { PRINT0(LD_IO, "call kmpc_dispatch_init_8u\n"); @@ -560,14 +564,14 @@ } // next -EXTERN int __kmpc_dispatch_next_4(kmp_Ident *loc, int32_t tid, int32_t *p_last, +int __kmpc_dispatch_next_4(kmp_Ident *loc, int32_t tid, int32_t *p_last, int32_t *p_lb, int32_t *p_ub, int32_t *p_st) { PRINT0(LD_IO, "call kmpc_dispatch_next_4\n"); return omptarget_nvptx_LoopSupport::dispatch_next( loc, tid, p_last, p_lb, p_ub, p_st); } -EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t tid, +int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t tid, int32_t *p_last, uint32_t *p_lb, uint32_t *p_ub, int32_t *p_st) { PRINT0(LD_IO, "call kmpc_dispatch_next_4u\n"); @@ -575,14 +579,14 @@ loc, tid, p_last, p_lb, p_ub, p_st); } -EXTERN int __kmpc_dispatch_next_8(kmp_Ident *loc, int32_t tid, int32_t *p_last, +int __kmpc_dispatch_next_8(kmp_Ident *loc, int32_t tid, int32_t *p_last, int64_t *p_lb, int64_t *p_ub, int64_t *p_st) { PRINT0(LD_IO, "call kmpc_dispatch_next_8\n"); return omptarget_nvptx_LoopSupport::dispatch_next( loc, tid, p_last, p_lb, p_ub, p_st); } -EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t tid, +int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t tid, int32_t *p_last, uint64_t *p_lb, uint64_t *p_ub, int64_t *p_st) { PRINT0(LD_IO, "call kmpc_dispatch_next_8u\n"); @@ -591,22 +595,22 @@ } // fini -EXTERN void __kmpc_dispatch_fini_4(kmp_Ident *loc, int32_t tid) { +void __kmpc_dispatch_fini_4(kmp_Ident *loc, int32_t tid) { PRINT0(LD_IO, "call kmpc_dispatch_fini_4\n"); omptarget_nvptx_LoopSupport::dispatch_fini(); } -EXTERN void __kmpc_dispatch_fini_4u(kmp_Ident *loc, int32_t tid) { +void __kmpc_dispatch_fini_4u(kmp_Ident *loc, int32_t tid) { PRINT0(LD_IO, "call kmpc_dispatch_fini_4u\n"); omptarget_nvptx_LoopSupport::dispatch_fini(); } -EXTERN void __kmpc_dispatch_fini_8(kmp_Ident *loc, int32_t tid) { +void __kmpc_dispatch_fini_8(kmp_Ident *loc, int32_t tid) { PRINT0(LD_IO, "call kmpc_dispatch_fini_8\n"); omptarget_nvptx_LoopSupport::dispatch_fini(); } -EXTERN void __kmpc_dispatch_fini_8u(kmp_Ident *loc, int32_t tid) { +void __kmpc_dispatch_fini_8u(kmp_Ident *loc, int32_t tid) { PRINT0(LD_IO, "call kmpc_dispatch_fini_8u\n"); omptarget_nvptx_LoopSupport::dispatch_fini(); } @@ -615,7 +619,7 @@ // KMP interface implementation (static loops) //////////////////////////////////////////////////////////////////////////////// -EXTERN void __kmpc_for_static_init_4(kmp_Ident *loc, int32_t global_tid, +void __kmpc_for_static_init_4(kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, int32_t *plower, int32_t *pupper, int32_t *pstride, int32_t incr, @@ -626,7 +630,7 @@ checkSPMDMode(loc)); } -EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid, +void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr, @@ -637,7 +641,7 @@ checkSPMDMode(loc)); } -EXTERN void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid, +void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, int64_t *plower, int64_t *pupper, int64_t *pstride, int64_t incr, @@ -648,7 +652,7 @@ checkSPMDMode(loc)); } -EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid, +void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr, @@ -659,7 +663,6 @@ checkSPMDMode(loc)); } -EXTERN void __kmpc_for_static_init_4_simple_spmd(kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, int32_t *plower, int32_t *pupper, @@ -671,7 +674,6 @@ /*IsSPMDExecutionMode=*/true); } -EXTERN void __kmpc_for_static_init_4u_simple_spmd(kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, uint32_t *plower, @@ -683,7 +685,6 @@ /*IsSPMDExecutionMode=*/true); } -EXTERN void __kmpc_for_static_init_8_simple_spmd(kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, int64_t *plower, int64_t *pupper, @@ -695,7 +696,6 @@ /*IsSPMDExecutionMode=*/true); } -EXTERN void __kmpc_for_static_init_8u_simple_spmd(kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, uint64_t *plower, @@ -707,7 +707,6 @@ /*IsSPMDExecutionMode=*/true); } -EXTERN void __kmpc_for_static_init_4_simple_generic( kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, int32_t *plower, int32_t *pupper, int32_t *pstride, int32_t incr, @@ -718,7 +717,6 @@ /*IsSPMDExecutionMode=*/false); } -EXTERN void __kmpc_for_static_init_4u_simple_generic( kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr, @@ -729,7 +727,6 @@ /*IsSPMDExecutionMode=*/false); } -EXTERN void __kmpc_for_static_init_8_simple_generic( kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, int64_t *plower, int64_t *pupper, int64_t *pstride, int64_t incr, @@ -740,7 +737,6 @@ /*IsSPMDExecutionMode=*/false); } -EXTERN void __kmpc_for_static_init_8u_simple_generic( kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr, @@ -751,6 +747,6 @@ /*IsSPMDExecutionMode=*/false); } -EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid) { +void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid) { PRINT0(LD_IO, "call kmpc_for_static_fini\n"); } diff --git a/openmp/libomptarget/deviceRTLs/common/state-queue.h b/openmp/libomptarget/deviceRTLs/common/state-queue.h --- a/openmp/libomptarget/deviceRTLs/common/state-queue.h +++ b/openmp/libomptarget/deviceRTLs/common/state-queue.h @@ -21,7 +21,7 @@ #include -#include "target_impl.h" +//#include "target_impl.h" template class omptarget_nvptx_Queue { private: @@ -32,18 +32,18 @@ volatile uint32_t tail; static const uint32_t MAX_ID = (1u << 31) / SIZE / 2; - INLINE uint32_t ENQUEUE_TICKET(); - INLINE uint32_t DEQUEUE_TICKET(); - INLINE static uint32_t ID(uint32_t ticket); - INLINE bool IsServing(uint32_t slot, uint32_t id); - INLINE void PushElement(uint32_t slot, ElementType *element); - INLINE ElementType *PopElement(uint32_t slot); - INLINE void DoneServing(uint32_t slot, uint32_t id); + uint32_t ENQUEUE_TICKET(); + uint32_t DEQUEUE_TICKET(); + static uint32_t ID(uint32_t ticket); + bool IsServing(uint32_t slot, uint32_t id); + void PushElement(uint32_t slot, ElementType *element); + ElementType *PopElement(uint32_t slot); + void DoneServing(uint32_t slot, uint32_t id); public: - INLINE omptarget_nvptx_Queue() {} - INLINE void Enqueue(ElementType *element); - INLINE ElementType *Dequeue(); + omptarget_nvptx_Queue() {} + void Enqueue(ElementType *element); + ElementType *Dequeue(); }; #include "state-queuei.h" diff --git a/openmp/libomptarget/deviceRTLs/common/state-queuei.h b/openmp/libomptarget/deviceRTLs/common/state-queuei.h --- a/openmp/libomptarget/deviceRTLs/common/state-queuei.h +++ b/openmp/libomptarget/deviceRTLs/common/state-queuei.h @@ -20,51 +20,48 @@ #include "common/target_atomic.h" template -INLINE uint32_t omptarget_nvptx_Queue::ENQUEUE_TICKET() { +uint32_t omptarget_nvptx_Queue::ENQUEUE_TICKET() { return __kmpc_atomic_add((unsigned int *)&tail, 1u); } template -INLINE uint32_t omptarget_nvptx_Queue::DEQUEUE_TICKET() { +uint32_t omptarget_nvptx_Queue::DEQUEUE_TICKET() { return __kmpc_atomic_add((unsigned int *)&head, 1u); } template -INLINE uint32_t -omptarget_nvptx_Queue::ID(uint32_t ticket) { +uint32_t omptarget_nvptx_Queue::ID(uint32_t ticket) { return (ticket / SIZE) * 2; } template -INLINE bool omptarget_nvptx_Queue::IsServing(uint32_t slot, - uint32_t id) { +bool omptarget_nvptx_Queue::IsServing(uint32_t slot, + uint32_t id) { return __kmpc_atomic_add((unsigned int *)&ids[slot], 0u) == id; } template -INLINE void -omptarget_nvptx_Queue::PushElement(uint32_t slot, - ElementType *element) { +void omptarget_nvptx_Queue::PushElement( + uint32_t slot, ElementType *element) { __kmpc_atomic_exchange((unsigned long long *)&elementQueue[slot], (unsigned long long)element); } template -INLINE ElementType * +ElementType * omptarget_nvptx_Queue::PopElement(uint32_t slot) { return (ElementType *)__kmpc_atomic_add( (unsigned long long *)&elementQueue[slot], (unsigned long long)0); } template -INLINE void omptarget_nvptx_Queue::DoneServing(uint32_t slot, - uint32_t id) { +void omptarget_nvptx_Queue::DoneServing(uint32_t slot, + uint32_t id) { __kmpc_atomic_exchange((unsigned int *)&ids[slot], (id + 1) % MAX_ID); } template -INLINE void -omptarget_nvptx_Queue::Enqueue(ElementType *element) { +void omptarget_nvptx_Queue::Enqueue(ElementType *element) { uint32_t ticket = ENQUEUE_TICKET(); uint32_t slot = ticket % SIZE; uint32_t id = ID(ticket) + 1; @@ -75,7 +72,7 @@ } template -INLINE ElementType *omptarget_nvptx_Queue::Dequeue() { +ElementType *omptarget_nvptx_Queue::Dequeue() { uint32_t ticket = DEQUEUE_TICKET(); uint32_t slot = ticket % SIZE; uint32_t id = ID(ticket); diff --git a/openmp/libomptarget/deviceRTLs/common/support.h b/openmp/libomptarget/deviceRTLs/common/support.h --- a/openmp/libomptarget/deviceRTLs/common/support.h +++ b/openmp/libomptarget/deviceRTLs/common/support.h @@ -31,59 +31,59 @@ RuntimeMask = 0x02u, }; -DEVICE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode); -DEVICE bool isGenericMode(); -DEVICE bool isSPMDMode(); -DEVICE bool isRuntimeUninitialized(); -DEVICE bool isRuntimeInitialized(); +void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode); +bool isGenericMode(); +bool isSPMDMode(); +bool isRuntimeUninitialized(); +bool isRuntimeInitialized(); //////////////////////////////////////////////////////////////////////////////// // Execution Modes based on location parameter fields //////////////////////////////////////////////////////////////////////////////// -DEVICE bool checkSPMDMode(kmp_Ident *loc); -DEVICE bool checkGenericMode(kmp_Ident *loc); -DEVICE bool checkRuntimeUninitialized(kmp_Ident *loc); -DEVICE bool checkRuntimeInitialized(kmp_Ident *loc); +bool checkSPMDMode(kmp_Ident *loc); +bool checkGenericMode(kmp_Ident *loc); +bool checkRuntimeUninitialized(kmp_Ident *loc); +bool checkRuntimeInitialized(kmp_Ident *loc); //////////////////////////////////////////////////////////////////////////////// // get info from machine //////////////////////////////////////////////////////////////////////////////// // get global ids to locate tread/team info (constant regardless of OMP) -DEVICE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode); -DEVICE int GetMasterThreadID(); -DEVICE int GetNumberOfWorkersInTeam(); +int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode); +int GetMasterThreadID(); +int GetNumberOfWorkersInTeam(); // get OpenMP thread and team ids -DEVICE int GetOmpThreadId(int threadId, - bool isSPMDExecutionMode); // omp_thread_num -DEVICE int GetOmpTeamId(); // omp_team_num +int GetOmpThreadId(int threadId, + bool isSPMDExecutionMode); // omp_thread_num +int GetOmpTeamId(); // omp_team_num // get OpenMP number of threads and team -DEVICE int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads -DEVICE int GetNumberOfOmpTeams(); // omp_num_teams +int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads +int GetNumberOfOmpTeams(); // omp_num_teams // get OpenMP number of procs -DEVICE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode); -DEVICE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode); +int GetNumberOfProcsInTeam(bool isSPMDExecutionMode); +int GetNumberOfProcsInDevice(bool isSPMDExecutionMode); // masters -DEVICE int IsTeamMaster(int ompThreadId); +int IsTeamMaster(int ompThreadId); // Parallel level -DEVICE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask); -DEVICE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask); +// void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask); +// void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask); //////////////////////////////////////////////////////////////////////////////// // Memory //////////////////////////////////////////////////////////////////////////////// // safe alloc and free -DEVICE void *SafeMalloc(size_t size, const char *msg); // check if success -DEVICE void *SafeFree(void *ptr, const char *msg); +void *SafeMalloc(size_t size, const char *msg); // check if success +void *SafeFree(void *ptr, const char *msg); // pad to a alignment (power of 2 only) -DEVICE unsigned long PadBytes(unsigned long size, unsigned long alignment); +unsigned long PadBytes(unsigned long size, unsigned long alignment); #define ADD_BYTES(_addr, _bytes) \ ((void *)((char *)((void *)(_addr)) + (_bytes))) #define SUB_BYTES(_addr, _bytes) \ @@ -92,7 +92,7 @@ //////////////////////////////////////////////////////////////////////////////// // Teams Reduction Scratchpad Helpers //////////////////////////////////////////////////////////////////////////////// -DEVICE unsigned int *GetTeamsReductionTimestamp(); -DEVICE char *GetTeamsReductionScratchpad(); +unsigned int *GetTeamsReductionTimestamp(); +char *GetTeamsReductionScratchpad(); #endif diff --git a/openmp/libomptarget/deviceRTLs/common/target_atomic.h b/openmp/libomptarget/deviceRTLs/common/target_atomic.h --- a/openmp/libomptarget/deviceRTLs/common/target_atomic.h +++ b/openmp/libomptarget/deviceRTLs/common/target_atomic.h @@ -13,25 +13,25 @@ #ifndef OMPTARGET_TARGET_ATOMIC_H #define OMPTARGET_TARGET_ATOMIC_H -#include "target_impl.h" +//#include "target_impl.h" -template INLINE T __kmpc_atomic_add(T *address, T val) { +template T __kmpc_atomic_add(T *address, T val) { return atomicAdd(address, val); } -template INLINE T __kmpc_atomic_inc(T *address, T val) { +template T __kmpc_atomic_inc(T *address, T val) { return atomicInc(address, val); } -template INLINE T __kmpc_atomic_max(T *address, T val) { +template T __kmpc_atomic_max(T *address, T val) { return atomicMax(address, val); } -template INLINE T __kmpc_atomic_exchange(T *address, T val) { +template T __kmpc_atomic_exchange(T *address, T val) { return atomicExch(address, val); } -template INLINE T __kmpc_atomic_cas(T *address, T compare, T val) { +template T __kmpc_atomic_cas(T *address, T compare, T val) { return atomicCAS(address, compare, val); } diff --git a/openmp/libomptarget/deviceRTLs/interface.h b/openmp/libomptarget/deviceRTLs/interface.h --- a/openmp/libomptarget/deviceRTLs/interface.h +++ b/openmp/libomptarget/deviceRTLs/interface.h @@ -47,51 +47,52 @@ omp_proc_bind_spread = 4 } omp_proc_bind_t; -EXTERN double omp_get_wtick(void); -EXTERN double omp_get_wtime(void); - -EXTERN void omp_set_num_threads(int num); -EXTERN int omp_get_num_threads(void); -EXTERN int omp_get_max_threads(void); -EXTERN int omp_get_thread_limit(void); -EXTERN int omp_get_thread_num(void); -EXTERN int omp_get_num_procs(void); -EXTERN int omp_in_parallel(void); -EXTERN int omp_in_final(void); -EXTERN void omp_set_dynamic(int flag); -EXTERN int omp_get_dynamic(void); -EXTERN void omp_set_nested(int flag); -EXTERN int omp_get_nested(void); -EXTERN void omp_set_max_active_levels(int level); -EXTERN int omp_get_max_active_levels(void); -EXTERN int omp_get_level(void); -EXTERN int omp_get_active_level(void); -EXTERN int omp_get_ancestor_thread_num(int level); -EXTERN int omp_get_team_size(int level); - -EXTERN void omp_init_lock(omp_lock_t *lock); -EXTERN void omp_init_nest_lock(omp_nest_lock_t *lock); -EXTERN void omp_destroy_lock(omp_lock_t *lock); -EXTERN void omp_destroy_nest_lock(omp_nest_lock_t *lock); -EXTERN void omp_set_lock(omp_lock_t *lock); -EXTERN void omp_set_nest_lock(omp_nest_lock_t *lock); -EXTERN void omp_unset_lock(omp_lock_t *lock); -EXTERN void omp_unset_nest_lock(omp_nest_lock_t *lock); -EXTERN int omp_test_lock(omp_lock_t *lock); -EXTERN int omp_test_nest_lock(omp_nest_lock_t *lock); - -EXTERN void omp_get_schedule(omp_sched_t *kind, int *modifier); -EXTERN void omp_set_schedule(omp_sched_t kind, int modifier); -EXTERN omp_proc_bind_t omp_get_proc_bind(void); -EXTERN int omp_get_cancellation(void); -EXTERN void omp_set_default_device(int deviceId); -EXTERN int omp_get_default_device(void); -EXTERN int omp_get_num_devices(void); -EXTERN int omp_get_num_teams(void); -EXTERN int omp_get_team_num(void); -EXTERN int omp_is_initial_device(void); -EXTERN int omp_get_initial_device(void); -EXTERN int omp_get_max_task_priority(void); +double omp_get_wtick(void); +double omp_get_wtime(void); + +void omp_set_num_threads(int num); +int omp_get_num_threads(void); +int omp_get_max_threads(void); +int omp_get_thread_limit(void); +int omp_get_thread_num(void); +int omp_get_num_procs(void); +int omp_in_parallel(void); +int omp_in_final(void); +void omp_set_dynamic(int flag); +int omp_get_dynamic(void); +void omp_set_nested(int flag); +int omp_get_nested(void); +void omp_set_max_active_levels(int level); +int omp_get_max_active_levels(void); +int omp_get_level(void); +int omp_get_active_level(void); +int omp_get_ancestor_thread_num(int level); +int omp_get_team_size(int level); + +struct omp_lock_t; +void omp_init_lock(omp_lock_t *lock); +void omp_init_nest_lock(omp_nest_lock_t *lock); +void omp_destroy_lock(omp_lock_t *lock); +void omp_destroy_nest_lock(omp_nest_lock_t *lock); +void omp_set_lock(omp_lock_t *lock); +void omp_set_nest_lock(omp_nest_lock_t *lock); +void omp_unset_lock(omp_lock_t *lock); +void omp_unset_nest_lock(omp_nest_lock_t *lock); +int omp_test_lock(omp_lock_t *lock); +int omp_test_nest_lock(omp_nest_lock_t *lock); + +void omp_get_schedule(omp_sched_t *kind, int *modifier); +void omp_set_schedule(omp_sched_t kind, int modifier); +omp_proc_bind_t omp_get_proc_bind(void); +int omp_get_cancellation(void); +void omp_set_default_device(int deviceId); +int omp_get_default_device(void); +int omp_get_num_devices(void); +int omp_get_num_teams(void); +int omp_get_team_num(void); +int omp_is_initial_device(void); +int omp_get_initial_device(void); +int omp_get_max_task_priority(void); //////////////////////////////////////////////////////////////////////////////// // file below is swiped from kmpc host interface @@ -217,235 +218,214 @@ //////////////////////////////////////////////////////////////////////////////// // parallel -EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc); -EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t global_tid, - int32_t num_threads); -EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid); -EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc, - uint32_t global_tid); -EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid); +int32_t __kmpc_global_thread_num(kmp_Ident *loc); +void __kmpc_push_num_threads(kmp_Ident *loc, int32_t global_tid, + int32_t num_threads); +void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid); +void __kmpc_end_serialized_parallel(kmp_Ident *loc, uint32_t global_tid); +uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid); // proc bind -EXTERN void __kmpc_push_proc_bind(kmp_Ident *loc, uint32_t global_tid, - int proc_bind); -EXTERN int omp_get_num_places(void); -EXTERN int omp_get_place_num_procs(int place_num); -EXTERN void omp_get_place_proc_ids(int place_num, int *ids); -EXTERN int omp_get_place_num(void); -EXTERN int omp_get_partition_num_places(void); -EXTERN void omp_get_partition_place_nums(int *place_nums); +void __kmpc_push_proc_bind(kmp_Ident *loc, uint32_t global_tid, int proc_bind); +int omp_get_num_places(void); +int omp_get_place_num_procs(int place_num); +void omp_get_place_proc_ids(int place_num, int *ids); +int omp_get_place_num(void); +int omp_get_partition_num_places(void); +void omp_get_partition_place_nums(int *place_nums); // for static (no chunk or chunk) -EXTERN void __kmpc_for_static_init_4(kmp_Ident *loc, int32_t global_tid, - int32_t sched, int32_t *plastiter, - int32_t *plower, int32_t *pupper, - int32_t *pstride, int32_t incr, - int32_t chunk); -EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid, - int32_t sched, int32_t *plastiter, - uint32_t *plower, uint32_t *pupper, - int32_t *pstride, int32_t incr, - int32_t chunk); -EXTERN void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid, - int32_t sched, int32_t *plastiter, - int64_t *plower, int64_t *pupper, - int64_t *pstride, int64_t incr, - int64_t chunk); -EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid, - int32_t sched, int32_t *plastiter1, - uint64_t *plower, uint64_t *pupper, - int64_t *pstride, int64_t incr, - int64_t chunk); -EXTERN +void __kmpc_for_static_init_4(kmp_Ident *loc, int32_t global_tid, int32_t sched, + int32_t *plastiter, int32_t *plower, + int32_t *pupper, int32_t *pstride, int32_t incr, + int32_t chunk); +void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter, + uint32_t *plower, uint32_t *pupper, + int32_t *pstride, int32_t incr, int32_t chunk); +void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid, int32_t sched, + int32_t *plastiter, int64_t *plower, + int64_t *pupper, int64_t *pstride, int64_t incr, + int64_t chunk); +void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter1, + uint64_t *plower, uint64_t *pupper, + int64_t *pstride, int64_t incr, int64_t chunk); void __kmpc_for_static_init_4_simple_spmd(kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t *plastiter, int32_t *plower, int32_t *pupper, int32_t *pstride, int32_t incr, int32_t chunk); -EXTERN void __kmpc_for_static_init_4u_simple_spmd(kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t *plastiter, uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr, int32_t chunk); -EXTERN void __kmpc_for_static_init_8_simple_spmd(kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t *plastiter, int64_t *plower, int64_t *pupper, int64_t *pstride, int64_t incr, int64_t chunk); -EXTERN void __kmpc_for_static_init_8u_simple_spmd(kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t *plastiter1, uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr, int64_t chunk); -EXTERN void __kmpc_for_static_init_4_simple_generic(kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t *plastiter, int32_t *plower, int32_t *pupper, int32_t *pstride, int32_t incr, int32_t chunk); -EXTERN void __kmpc_for_static_init_4u_simple_generic( kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t *plastiter, uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr, int32_t chunk); -EXTERN void __kmpc_for_static_init_8_simple_generic(kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t *plastiter, int64_t *plower, int64_t *pupper, int64_t *pstride, int64_t incr, int64_t chunk); -EXTERN void __kmpc_for_static_init_8u_simple_generic( kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t *plastiter1, uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr, int64_t chunk); -EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid); +void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid); // for dynamic -EXTERN void __kmpc_dispatch_init_4(kmp_Ident *loc, int32_t global_tid, - int32_t sched, int32_t lower, int32_t upper, - int32_t incr, int32_t chunk); -EXTERN void __kmpc_dispatch_init_4u(kmp_Ident *loc, int32_t global_tid, - int32_t sched, uint32_t lower, - uint32_t upper, int32_t incr, - int32_t chunk); -EXTERN void __kmpc_dispatch_init_8(kmp_Ident *loc, int32_t global_tid, - int32_t sched, int64_t lower, int64_t upper, - int64_t incr, int64_t chunk); -EXTERN void __kmpc_dispatch_init_8u(kmp_Ident *loc, int32_t global_tid, - int32_t sched, uint64_t lower, - uint64_t upper, int64_t incr, - int64_t chunk); - -EXTERN int __kmpc_dispatch_next_4(kmp_Ident *loc, int32_t global_tid, - int32_t *plastiter, int32_t *plower, - int32_t *pupper, int32_t *pstride); -EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t global_tid, - int32_t *plastiter, uint32_t *plower, - uint32_t *pupper, int32_t *pstride); -EXTERN int __kmpc_dispatch_next_8(kmp_Ident *loc, int32_t global_tid, - int32_t *plastiter, int64_t *plower, - int64_t *pupper, int64_t *pstride); -EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t global_tid, - int32_t *plastiter, uint64_t *plower, - uint64_t *pupper, int64_t *pstride); - -EXTERN void __kmpc_dispatch_fini_4(kmp_Ident *loc, int32_t global_tid); -EXTERN void __kmpc_dispatch_fini_4u(kmp_Ident *loc, int32_t global_tid); -EXTERN void __kmpc_dispatch_fini_8(kmp_Ident *loc, int32_t global_tid); -EXTERN void __kmpc_dispatch_fini_8u(kmp_Ident *loc, int32_t global_tid); +void __kmpc_dispatch_init_4(kmp_Ident *loc, int32_t global_tid, int32_t sched, + int32_t lower, int32_t upper, int32_t incr, + int32_t chunk); +void __kmpc_dispatch_init_4u(kmp_Ident *loc, int32_t global_tid, int32_t sched, + uint32_t lower, uint32_t upper, int32_t incr, + int32_t chunk); +void __kmpc_dispatch_init_8(kmp_Ident *loc, int32_t global_tid, int32_t sched, + int64_t lower, int64_t upper, int64_t incr, + int64_t chunk); +void __kmpc_dispatch_init_8u(kmp_Ident *loc, int32_t global_tid, int32_t sched, + uint64_t lower, uint64_t upper, int64_t incr, + int64_t chunk); + +int __kmpc_dispatch_next_4(kmp_Ident *loc, int32_t global_tid, + int32_t *plastiter, int32_t *plower, int32_t *pupper, + int32_t *pstride); +int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t global_tid, + int32_t *plastiter, uint32_t *plower, + uint32_t *pupper, int32_t *pstride); +int __kmpc_dispatch_next_8(kmp_Ident *loc, int32_t global_tid, + int32_t *plastiter, int64_t *plower, int64_t *pupper, + int64_t *pstride); +int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t global_tid, + int32_t *plastiter, uint64_t *plower, + uint64_t *pupper, int64_t *pstride); + +void __kmpc_dispatch_fini_4(kmp_Ident *loc, int32_t global_tid); +void __kmpc_dispatch_fini_4u(kmp_Ident *loc, int32_t global_tid); +void __kmpc_dispatch_fini_8(kmp_Ident *loc, int32_t global_tid); +void __kmpc_dispatch_fini_8u(kmp_Ident *loc, int32_t global_tid); // reduction -EXTERN void __kmpc_nvptx_end_reduce(int32_t global_tid); -EXTERN void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid); -EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_v2( +void __kmpc_nvptx_end_reduce(int32_t global_tid); +void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid); +int32_t __kmpc_nvptx_parallel_reduce_nowait_v2( kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct); -EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2( +int32_t __kmpc_nvptx_teams_reduce_nowait_v2( kmp_Ident *loc, int32_t global_tid, void *global_buffer, int32_t num_of_records, void *reduce_data, kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, kmp_ListGlobalFctPtr lgcpyFct, kmp_ListGlobalFctPtr lgredFct, kmp_ListGlobalFctPtr glcpyFct, kmp_ListGlobalFctPtr glredFct); -EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size); -EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size); +int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size); +int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size); // sync barrier -EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid); -EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid); -EXTERN int32_t __kmpc_cancel_barrier(kmp_Ident *loc, int32_t global_tid); +void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid); +void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid); +int32_t __kmpc_cancel_barrier(kmp_Ident *loc, int32_t global_tid); // single -EXTERN int32_t __kmpc_single(kmp_Ident *loc, int32_t global_tid); -EXTERN void __kmpc_end_single(kmp_Ident *loc, int32_t global_tid); +int32_t __kmpc_single(kmp_Ident *loc, int32_t global_tid); +void __kmpc_end_single(kmp_Ident *loc, int32_t global_tid); // sync -EXTERN int32_t __kmpc_master(kmp_Ident *loc, int32_t global_tid); -EXTERN void __kmpc_end_master(kmp_Ident *loc, int32_t global_tid); -EXTERN void __kmpc_ordered(kmp_Ident *loc, int32_t global_tid); -EXTERN void __kmpc_end_ordered(kmp_Ident *loc, int32_t global_tid); -EXTERN void __kmpc_critical(kmp_Ident *loc, int32_t global_tid, - kmp_CriticalName *crit); -EXTERN void __kmpc_end_critical(kmp_Ident *loc, int32_t global_tid, - kmp_CriticalName *crit); -EXTERN void __kmpc_flush(kmp_Ident *loc); +int32_t __kmpc_master(kmp_Ident *loc, int32_t global_tid); +void __kmpc_end_master(kmp_Ident *loc, int32_t global_tid); +void __kmpc_ordered(kmp_Ident *loc, int32_t global_tid); +void __kmpc_end_ordered(kmp_Ident *loc, int32_t global_tid); +void __kmpc_critical(kmp_Ident *loc, int32_t global_tid, + kmp_CriticalName *crit); +void __kmpc_end_critical(kmp_Ident *loc, int32_t global_tid, + kmp_CriticalName *crit); +void __kmpc_flush(kmp_Ident *loc); // vote -EXTERN __kmpc_impl_lanemask_t __kmpc_warp_active_thread_mask(); +//__kmpc_impl_lanemask_t __kmpc_warp_active_thread_mask(); // syncwarp -EXTERN void __kmpc_syncwarp(__kmpc_impl_lanemask_t); +// void __kmpc_syncwarp(__kmpc_impl_lanemask_t); // tasks -EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc(kmp_Ident *loc, - uint32_t global_tid, int32_t flag, - size_t sizeOfTaskInclPrivate, - size_t sizeOfSharedTable, - kmp_TaskFctPtr sub); -EXTERN int32_t __kmpc_omp_task(kmp_Ident *loc, uint32_t global_tid, +kmp_TaskDescr *__kmpc_omp_task_alloc(kmp_Ident *loc, uint32_t global_tid, + int32_t flag, size_t sizeOfTaskInclPrivate, + size_t sizeOfSharedTable, + kmp_TaskFctPtr sub); +int32_t __kmpc_omp_task(kmp_Ident *loc, uint32_t global_tid, + kmp_TaskDescr *newLegacyTaskDescr); +int32_t __kmpc_omp_task_with_deps(kmp_Ident *loc, uint32_t global_tid, + kmp_TaskDescr *newLegacyTaskDescr, + int32_t depNum, void *depList, + int32_t noAliasDepNum, void *noAliasDepList); +void __kmpc_omp_task_begin_if0(kmp_Ident *loc, uint32_t global_tid, kmp_TaskDescr *newLegacyTaskDescr); -EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Ident *loc, uint32_t global_tid, - kmp_TaskDescr *newLegacyTaskDescr, - int32_t depNum, void *depList, - int32_t noAliasDepNum, - void *noAliasDepList); -EXTERN void __kmpc_omp_task_begin_if0(kmp_Ident *loc, uint32_t global_tid, - kmp_TaskDescr *newLegacyTaskDescr); -EXTERN void __kmpc_omp_task_complete_if0(kmp_Ident *loc, uint32_t global_tid, - kmp_TaskDescr *newLegacyTaskDescr); -EXTERN void __kmpc_omp_wait_deps(kmp_Ident *loc, uint32_t global_tid, - int32_t depNum, void *depList, - int32_t noAliasDepNum, void *noAliasDepList); -EXTERN void __kmpc_taskgroup(kmp_Ident *loc, uint32_t global_tid); -EXTERN void __kmpc_end_taskgroup(kmp_Ident *loc, uint32_t global_tid); -EXTERN int32_t __kmpc_omp_taskyield(kmp_Ident *loc, uint32_t global_tid, - int end_part); -EXTERN int32_t __kmpc_omp_taskwait(kmp_Ident *loc, uint32_t global_tid); -EXTERN void __kmpc_taskloop(kmp_Ident *loc, uint32_t global_tid, - kmp_TaskDescr *newKmpTaskDescr, int if_val, - uint64_t *lb, uint64_t *ub, int64_t st, int nogroup, - int32_t sched, uint64_t grainsize, void *task_dup); +void __kmpc_omp_task_complete_if0(kmp_Ident *loc, uint32_t global_tid, + kmp_TaskDescr *newLegacyTaskDescr); +void __kmpc_omp_wait_deps(kmp_Ident *loc, uint32_t global_tid, int32_t depNum, + void *depList, int32_t noAliasDepNum, + void *noAliasDepList); +void __kmpc_taskgroup(kmp_Ident *loc, uint32_t global_tid); +void __kmpc_end_taskgroup(kmp_Ident *loc, uint32_t global_tid); +int32_t __kmpc_omp_taskyield(kmp_Ident *loc, uint32_t global_tid, int end_part); +int32_t __kmpc_omp_taskwait(kmp_Ident *loc, uint32_t global_tid); +void __kmpc_taskloop(kmp_Ident *loc, uint32_t global_tid, + kmp_TaskDescr *newKmpTaskDescr, int if_val, uint64_t *lb, + uint64_t *ub, int64_t st, int nogroup, int32_t sched, + uint64_t grainsize, void *task_dup); // cancel -EXTERN int32_t __kmpc_cancellationpoint(kmp_Ident *loc, int32_t global_tid, - int32_t cancelVal); -EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid, - int32_t cancelVal); +int32_t __kmpc_cancellationpoint(kmp_Ident *loc, int32_t global_tid, + int32_t cancelVal); +int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid, int32_t cancelVal); // non standard -EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime); -EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized); -EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, - int16_t RequiresOMPRuntime); -EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime); -EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn); -EXTERN bool __kmpc_kernel_parallel(void **WorkFn); -EXTERN void __kmpc_kernel_end_parallel(); - -EXTERN void __kmpc_data_sharing_init_stack(); -EXTERN void __kmpc_data_sharing_init_stack_spmd(); -EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t size, - int16_t UseSharedMemory); -EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t UseSharedMemory); -EXTERN void __kmpc_data_sharing_pop_stack(void *a); -EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs); -EXTERN void __kmpc_end_sharing_variables(); -EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs); +void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime); +void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized); +void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime); +void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime); +void __kmpc_kernel_prepare_parallel(void *WorkFn); +bool __kmpc_kernel_parallel(void **WorkFn); +void __kmpc_kernel_end_parallel(); + +void __kmpc_data_sharing_init_stack(); +void __kmpc_data_sharing_init_stack_spmd(); +void *__kmpc_data_sharing_coalesced_push_stack(size_t size, + int16_t UseSharedMemory); +void *__kmpc_data_sharing_push_stack(size_t size, int16_t UseSharedMemory); +void __kmpc_data_sharing_pop_stack(void *a); +void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs); +void __kmpc_end_sharing_variables(); +void __kmpc_get_shared_variables(void ***GlobalArgs); // SPMD execution mode interrogation function. -EXTERN int8_t __kmpc_is_spmd_exec_mode(); +int8_t __kmpc_is_spmd_exec_mode(); -EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode, - const void *buf, size_t size, - int16_t is_shared, const void **res); +void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode, const void *buf, + size_t size, int16_t is_shared, + const void **res); -EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode, - int16_t is_shared); +void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode, + int16_t is_shared); #endif