diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h @@ -47,8 +47,6 @@ #define OMP_STATE_COUNT 32 #define MAX_SM 64 -#define OMP_ACTIVE_PARALLEL_LEVEL 128 - // Data sharing related quantities, need to match what is used in the compiler. enum DATA_SHARING_SIZES { // The maximum number of workers in a kernel. diff --git a/openmp/libomptarget/deviceRTLs/common/include/ICVs.h b/openmp/libomptarget/deviceRTLs/common/include/ICVs.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/deviceRTLs/common/include/ICVs.h @@ -0,0 +1,113 @@ +//===--------- ICVs.h - OpenMP ICV handling ----------------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// +#ifndef OMPTARGET_ICVS_H +#define OMPTARGET_ICVS_H + +struct ICVStateTy { + int nthreads_var; + + int levels_var; + + /// The `active-level` describes which of the parallel levels counted with the + /// `levels-var` is active. There can only be one. + /// + /// active-levels-var is 1, if active_level is not 0, otherweise it is 0. + int active_level; + + static bool ensureICVStateForThread(unsigned TId); + + static int &getICVForThread(int ICVStateTy::*Var); + static int incICVForThread(int ICVStateTy::*Var, int UpdateVal); + static int setICVForThread(int ICVStateTy::*Var, int UpdateVal); +}; + +#ifdef __cplusplus +extern "C" { +#endif + +#pragma omp declare target + +/// ICV: dyn-var, constant 0 +/// +/// setter: ignored. +/// getter: returns 0. +/// +///{ +void omp_set_dynamic(int); +int omp_get_dynamic(void); +///} + +/// ICV: nthreads-var, integer +/// +/// scope: data environment +/// +/// setter: ignored. +/// getter: returns false. +/// +/// implementation notes: +/// +/// +///{ +void omp_set_num_threads(int); +int omp_get_max_threads(void); +///} + +/// ICV: thread-limit-var, computed +/// +/// getter: returns thread limited defined during launch. +/// +///{ +int omp_get_thread_limit(void); +///} + +/// ICV: max-active-levels-var, constant 1 +/// +/// setter: ignored. +/// getter: returns 1. +/// +///{ +void omp_set_max_active_levels(int); +int omp_get_max_active_levels(void); +///} + +/// ICV: places-partition-var +/// +/// +///{ +///} + +/// ICV: active-levels-var, 0 or 1 +/// +/// getter: returns 0 or 1. +/// +///{ +int omp_get_active_levels(void); +///} + +/// ICV: levels-var +/// +/// getter: returns parallel region nesting +/// +///{ +int omp_get_levels(void); +///} + +/// TODO this is incomplete. +int omp_get_num_threads(void); +int omp_get_thread_num(void); + +#pragma omp end declare target + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/openmp/libomptarget/deviceRTLs/common/include/Mapping.h b/openmp/libomptarget/deviceRTLs/common/include/Mapping.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/deviceRTLs/common/include/Mapping.h @@ -0,0 +1,30 @@ +//===--------- Mapping.h - OpenMP device runtime mapping helpers -- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// +#ifndef OMPTARGET_MAPPING_H +#define OMPTARGET_MAPPING_H + +namespace omp { + +#pragma omp declare target + +/// TODO +bool isMainThreadInGenericMode(); + +bool isLeaderInSIMD(); + +unsigned getNumberOfThreadsAccessingSharedMem(); +unsigned getThreadIdForSharedMemArrayAccess(); + +#pragma omp end declare target + +} // namespace omp + +#endif diff --git a/openmp/libomptarget/deviceRTLs/common/include/TeamState.h b/openmp/libomptarget/deviceRTLs/common/include/TeamState.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/deviceRTLs/common/include/TeamState.h @@ -0,0 +1,46 @@ +//===--------- TeamState.h - OpenMP team state description -------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// +#ifndef OMPTARGET_TEAM_STATE_H +#define OMPTARGET_TEAM_STATE_H + +#include + +#include "ICVs.h" +#include "allocator.h" + +namespace omp { + +#pragma omp declare target + +struct TeamStateTy { + + /// TODO: provide a proper init function. + void init() {} + + /// ICVs + /// + /// Preallocated storage for ICV values that are used if the threads have not + /// set a custom default. The latter is supported but unlikely and slow(er). + /// + ///{ + ICVStateTy ICVState; + ///} + + uint16_t ParallelTeamSize; +}; + +extern TeamStateTy EXTERN_SHARED(TeamState); + +#pragma omp end declare target + +} // namespace omp + +#endif diff --git a/openmp/libomptarget/deviceRTLs/common/include/ThreadState.h b/openmp/libomptarget/deviceRTLs/common/include/ThreadState.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/deviceRTLs/common/include/ThreadState.h @@ -0,0 +1,59 @@ +//===--------- ThreadState.h - OpenMP thread state description ---- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// +#ifndef OMPTARGET_THREAD_STATE_H +#define OMPTARGET_THREAD_STATE_H + +#include "ICVs.h" +#include "TeamState.h" +#include "Utils.h" +#include "allocator.h" + +namespace omp { + +#pragma omp declare target + +struct ThreadStateTy { + + /// ICVs have preallocated storage in the TeamStateTy which is used if a + /// thread has not set a custom value. The latter is supported but unlikely. + /// When it happens we will allocate dynamic memory to hold the values of all + /// ICVs. Thus, the first time an ICV is set by a thread we will allocate an + /// ICV struct to hold them all. This is slower than alternatives but allows + /// users to pay only for what they use. + /// + ICVStateTy ICVState; + + ThreadStateTy *PreviousThreadState; + + void init() { + ICVState = TeamState.ICVState; + PreviousThreadState = nullptr; + } + + void init(ThreadStateTy &PreviousTS) { + ICVState = PreviousTS.ICVState; + PreviousThreadState = &PreviousTS; + } + + static void dropForThread(unsigned TId); + + static void enterDataEnvironment(); + static void exitDataEnvironment(); +}; + +extern ThreadStateTy *ThreadStates[MAX_THREADS_PER_TEAM]; +#pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc) + +#pragma omp end declare target + +} // namespace omp + +#endif diff --git a/openmp/libomptarget/deviceRTLs/common/include/Utils.h b/openmp/libomptarget/deviceRTLs/common/include/Utils.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/deviceRTLs/common/include/Utils.h @@ -0,0 +1,28 @@ +//===--------- Utils.h - OpenMP device runtime utility functions -- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// +#ifndef OMPTARGET_UTILS_H +#define OMPTARGET_UTILS_H + +#include + +#include "ICVs.h" +#include "Mapping.h" +#include "target_interface.h" + +namespace omp { + +//#pragma omp declare target + +//#pragma omp end declare target + +} // namespace omp + +#endif diff --git a/openmp/libomptarget/deviceRTLs/common/omptarget.h b/openmp/libomptarget/deviceRTLs/common/omptarget.h --- a/openmp/libomptarget/deviceRTLs/common/omptarget.h +++ b/openmp/libomptarget/deviceRTLs/common/omptarget.h @@ -234,10 +234,6 @@ topTaskDescr[tid] = taskICV; } INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid) const; - // parallel - INLINE uint16_t &NumThreadsForNextParallel(int tid) { - return nextRegion.tnum[tid]; - } // schedule (for dispatch) INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; } INLINE int64_t &Chunk(int tid) { return chunk[tid]; } @@ -257,11 +253,6 @@ omptarget_nvptx_TaskDescr levelOneTaskDescr[MAX_THREADS_PER_TEAM]; // pointer where to find the current task ICV (top of the stack) omptarget_nvptx_TaskDescr *topTaskDescr[MAX_THREADS_PER_TEAM]; - union { - // Only one of the two is live at the same time. - // parallel - uint16_t tnum[MAX_THREADS_PER_TEAM]; - } nextRegion; // schedule (for dispatch) kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for int64_t chunk[MAX_THREADS_PER_TEAM]; @@ -298,16 +289,6 @@ omptarget_nvptx_simpleMemoryManager; extern DEVICE uint32_t EXTERN_SHARED(usedMemIdx); extern DEVICE uint32_t EXTERN_SHARED(usedSlotIdx); -#if _OPENMP -extern DEVICE uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; -#pragma omp allocate(parallelLevel) allocator(omp_pteam_mem_alloc) -#else -extern DEVICE - uint8_t EXTERN_SHARED(parallelLevel)[MAX_THREADS_PER_TEAM / WARPSIZE]; -#endif -extern DEVICE uint16_t EXTERN_SHARED(threadLimit); -extern DEVICE uint16_t EXTERN_SHARED(threadsInTeam); -extern DEVICE uint16_t EXTERN_SHARED(nThreads); extern DEVICE omptarget_nvptx_ThreadPrivateContext * EXTERN_SHARED(omptarget_nvptx_threadPrivateContext); diff --git a/openmp/libomptarget/deviceRTLs/common/omptargeti.h b/openmp/libomptarget/deviceRTLs/common/omptargeti.h --- a/openmp/libomptarget/deviceRTLs/common/omptargeti.h +++ b/openmp/libomptarget/deviceRTLs/common/omptargeti.h @@ -157,8 +157,6 @@ // levelOneTaskDescr is init when starting the parallel region // top task descr is NULL (team master version will be fixed separately) topTaskDescr[tid] = NULL; - // no num threads value has been pushed - nextRegion.tnum[tid] = 0; // the following don't need to be init here; they are init when using dyn // sched // current_Event, events_Number, chunk, num_Iterations, schedule diff --git a/openmp/libomptarget/deviceRTLs/common/src/ICVs.cpp b/openmp/libomptarget/deviceRTLs/common/src/ICVs.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/deviceRTLs/common/src/ICVs.cpp @@ -0,0 +1,206 @@ +//===------------ libcall.cu - OpenMP GPU user calls ------------- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the OpenMP runtime functions that can be +// invoked by the user in an OpenMP region +// +//===----------------------------------------------------------------------===// + +#include "ICVs.h" +#include "Mapping.h" +#include "TeamState.h" +#include "ThreadState.h" +#include "omptarget.h" +#include "support.h" +#include "target_interface.h" +#include + +#define ICV_DEBUG(...) + +using namespace omp; + +#pragma omp declare target + +template static Ty &getICVForThreadImpl(Ty ICVStateTy::*Var) { + unsigned TId = getThreadIdForSharedMemArrayAccess(); + if (!ThreadStates[TId]) + return TeamState.ICVState.*Var; + + return ThreadStates[TId]->ICVState.*Var; +} + +int &ICVStateTy::getICVForThread(int ICVStateTy::*Var) { + return getICVForThreadImpl(Var); +} + +template +static Ty incICVForThreadImpl(Ty ICVStateTy::*Var, Ty UpdateVal) { + unsigned TId = getThreadIdForSharedMemArrayAccess(); + ICVStateTy::ensureICVStateForThread(TId); + + ThreadStates[TId]->ICVState.*Var += UpdateVal; + + return ThreadStates[TId]->ICVState.*Var; +} +int ICVStateTy::incICVForThread(int ICVStateTy::*Var, int UpdateVal) { + return incICVForThreadImpl(Var, UpdateVal); +} + +template +static Ty setICVForThreadImpl(Ty ICVStateTy::*Var, Ty UpdateVal) { + unsigned TId = getThreadIdForSharedMemArrayAccess(); + ICVStateTy::ensureICVStateForThread(TId); + + ThreadStates[TId]->ICVState.*Var = UpdateVal; + + return ThreadStates[TId]->ICVState.*Var; +} +int ICVStateTy::setICVForThread(int ICVStateTy::*Var, int UpdateVal) { + return setICVForThreadImpl(Var, UpdateVal); +} + +bool ICVStateTy::ensureICVStateForThread(unsigned TId) { + if (ThreadStates[TId]) + return false; + + ThreadStates[TId] = + static_cast(malloc(sizeof(ThreadStateTy))); + ThreadStates[TId]->init(); + return true; +} + +void omp_set_dynamic(int V) { ICV_DEBUG("(%i); ignored", V); } +int omp_get_dynamic(void) { + ICV_DEBUG("0; constant"); + return 0; +} + +void omp_set_num_threads(int V) { + if (isMainThreadInGenericMode()) { + ICV_DEBUG("(%i); stored in team state", V); + TeamState.ICVState.nthreads_var = V; + return; + } + + unsigned TId = getThreadIdForSharedMemArrayAccess(); + if (!ThreadStates[TId] && TeamState.ICVState.nthreads_var == V) { + ICV_DEBUG("(%i); equal to team state setting, ignored", V); + return; + } + + ICVStateTy::ensureICVStateForThread(TId); + + ICV_DEBUG("(%i); set nthreads-var ICV for thread", V); + ThreadStates[TId]->ICVState.nthreads_var = V; +} + +int omp_get_max_threads(void) { + return ICVStateTy::getICVForThread(&ICVStateTy::nthreads_var); +} + +/// TODO not all functions below belong here. + +int omp_get_level(void) { + int LevelsVar = ICVStateTy::getICVForThread(&ICVStateTy::levels_var); + __builtin_assume(LevelsVar >= 0); + return LevelsVar; +} + +int omp_get_active_level(void) { + return !!ICVStateTy::getICVForThread(&ICVStateTy::active_level); +} + +int omp_in_parallel(void) { + return !!ICVStateTy::getICVForThread(&ICVStateTy::active_level); +} + +static int returnValIfLevelIsActive(int Level, int Val, int DefaultVal, + int OutOfBoundsVal = -1) { + if (Level == 0) + return DefaultVal; + int LevelsVar = omp_get_level(); + if (Level < 0 || Level > LevelsVar) + return OutOfBoundsVal; + int ActiveLevel = ICVStateTy::getICVForThread(&ICVStateTy::active_level); + if (Level != ActiveLevel) + return DefaultVal; + return Val; +} + +int omp_get_ancestor_thread_num(int Level) { + return returnValIfLevelIsActive(Level, getThreadIdForSharedMemArrayAccess(), + 0); +} + +int omp_get_thread_num(void) { + return omp_get_ancestor_thread_num(omp_get_level()); +} + +int omp_get_team_size(int Level) { + return returnValIfLevelIsActive(Level, TeamState.ParallelTeamSize, 1); +} + +int omp_get_num_threads(void) { return omp_get_team_size(omp_get_level()); } + +void ThreadStateTy::enterDataEnvironment() { + unsigned TId = getThreadIdForSharedMemArrayAccess(); + if (ICVStateTy::ensureICVStateForThread(TId)) + return; + + ThreadStateTy *NewThreadState = + static_cast(malloc(sizeof(ThreadStateTy))); + NewThreadState->init(*ThreadStates[TId]); + ThreadStates[TId] = NewThreadState; +} + +void ThreadStateTy::exitDataEnvironment() { + unsigned TId = getThreadIdForSharedMemArrayAccess(); + // assert(ThreadStates[TId] && "exptected thread state"); + free(ThreadStates[TId]); + ThreadStates[TId] = ThreadStates[TId]->PreviousThreadState; +} + +bool omp::isMainThreadInGenericMode() { + if (isSPMDMode()) + return false; + + int TId = GetThreadIdInBlock(); + return TId == ((GetNumberOfThreadsInBlock() - 1) & ~(WARPSIZE - 1)); +} + +bool omp::isLeaderInSIMD() { + __kmpc_impl_lanemask_t Active = __kmpc_impl_activemask(); + __kmpc_impl_lanemask_t LaneMaskLT = __kmpc_impl_lanemask_lt(); + unsigned int Position = __kmpc_impl_popc(Active & LaneMaskLT); + return Position == 0; +} + +unsigned omp::getNumberOfThreadsAccessingSharedMem() { + return GetNumberOfThreadsInBlock(); +} + +unsigned omp::getThreadIdForSharedMemArrayAccess() { + return GetThreadIdInBlock(); +} + +void ThreadStateTy::dropForThread(unsigned TId) { + if (!ThreadStates[TId]) + return; + + // assert(!ThreadStates[TId]->PreviousThreadState && "leftover thread state"); + free(ThreadStates[TId]); + ThreadStates[TId] = nullptr; +} + +DEVICE TeamStateTy SHARED(omp::TeamState); + +[[clang::loader_uninitialized]] DEVICE ThreadStateTy + *omp::ThreadStates[MAX_THREADS_PER_TEAM]; +#pragma omp allocate(omp::ThreadStates) allocator(omp_pteam_mem_alloc) + +#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu --- a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu @@ -27,66 +27,20 @@ return rc; } -EXTERN void omp_set_num_threads(int num) { - // Ignore it for SPMD mode. - if (isSPMDMode()) - return; - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime."); - PRINT(LD_IO, "call omp_set_num_threads(num %d)\n", num); - if (num <= 0) { - WARNING0(LW_INPUT, "expected positive num; ignore\n"); - } else if (parallelLevel[GetWarpId()] == 0) { - nThreads = num; - } -} - -EXTERN int omp_get_num_threads(void) { - int rc = GetNumberOfOmpThreads(isSPMDMode()); - PRINT(LD_IO, "call omp_get_num_threads() return %d\n", rc); - return rc; -} - -EXTERN int omp_get_max_threads(void) { - if (parallelLevel[GetWarpId()] > 0) - // We're already in parallel region. - return 1; // default is 1 thread avail - // Not currently in a parallel region, return what was set. - int rc = 1; - if (parallelLevel[GetWarpId()] == 0) - rc = nThreads; - ASSERT0(LT_FUSSY, rc >= 0, "bad number of threads"); - PRINT(LD_IO, "call omp_get_max_threads() return %d\n", rc); - return rc; -} - EXTERN int omp_get_thread_limit(void) { if (isSPMDMode()) return GetNumberOfThreadsInBlock(); - int rc = threadLimit; + int rc = GetNumberOfWorkersInTeam(); PRINT(LD_IO, "call omp_get_thread_limit() return %d\n", rc); return rc; } -EXTERN int omp_get_thread_num() { - bool isSPMDExecutionMode = isSPMDMode(); - int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode); - int rc = GetOmpThreadId(tid, isSPMDExecutionMode); - PRINT(LD_IO, "call omp_get_thread_num() returns %d\n", rc); - return rc; -} - EXTERN int omp_get_num_procs(void) { int rc = GetNumberOfProcsInDevice(isSPMDMode()); PRINT(LD_IO, "call omp_get_num_procs() returns %d\n", rc); return rc; } -EXTERN int omp_in_parallel(void) { - int rc = parallelLevel[GetWarpId()] > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0; - PRINT(LD_IO, "call omp_in_parallel() returns %d\n", rc); - return rc; -} - EXTERN int omp_in_final(void) { // treat all tasks as final... Specs may expect runtime to keep // track more precisely if a task was actively set by users... This @@ -97,16 +51,6 @@ return rc; } -EXTERN void omp_set_dynamic(int flag) { - PRINT(LD_IO, "call omp_set_dynamic(%d) is ignored (no support)\n", flag); -} - -EXTERN int omp_get_dynamic(void) { - int rc = 0; - PRINT(LD_IO, "call omp_get_dynamic() returns %d\n", rc); - return rc; -} - EXTERN void omp_set_nested(int flag) { PRINT(LD_IO, "call omp_set_nested(%d) is ignored (no nested support)\n", flag); @@ -130,91 +74,6 @@ return rc; } -EXTERN int omp_get_level(void) { - int level = parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1); - PRINT(LD_IO, "call omp_get_level() returns %d\n", level); - return level; -} - -EXTERN int omp_get_active_level(void) { - int level = parallelLevel[GetWarpId()] > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0; - PRINT(LD_IO, "call omp_get_active_level() returns %d\n", level) - return level; -} - -EXTERN int omp_get_ancestor_thread_num(int level) { - if (isSPMDMode()) - return level == 1 ? GetThreadIdInBlock() : 0; - int rc = -1; - // If level is 0 or all parallel regions are not active - return 0. - unsigned parLevel = parallelLevel[GetWarpId()]; - if (level == 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL) { - int totLevel = omp_get_level(); - if (level <= totLevel) { - omptarget_nvptx_TaskDescr *currTaskDescr = - getMyTopTaskDescriptor(/*isSPMDExecutionMode=*/false); - int steps = totLevel - level; - PRINT(LD_IO, "backtrack %d steps\n", steps); - ASSERT0(LT_FUSSY, currTaskDescr, - "do not expect fct to be called in a non-active thread"); - do { - if (DON(LD_IOD)) { - // print current state - omp_sched_t sched = currTaskDescr->GetRuntimeSched(); - PRINT(LD_ALL, - "task descr %s %d: %s, in par %d, rt sched %d," - " chunk %" PRIu64 "; tid %d, tnum %d, nthreads %d\n", - "ancestor", steps, - (currTaskDescr->IsParallelConstruct() ? "par" : "task"), - (int)currTaskDescr->InParallelRegion(), (int)sched, - currTaskDescr->RuntimeChunkSize(), - (int)currTaskDescr->ThreadId(), (int)threadsInTeam, - (int)nThreads); - } - - if (currTaskDescr->IsParallelConstruct()) { - // found the level - if (!steps) { - rc = currTaskDescr->ThreadId(); - break; - } - steps--; - } - currTaskDescr = currTaskDescr->GetPrevTaskDescr(); - } while (currTaskDescr); - ASSERT0(LT_FUSSY, !steps, "expected to find all steps"); - } - } else if (level == 0 || - (level > 0 && parLevel < OMP_ACTIVE_PARALLEL_LEVEL && - level <= parLevel) || - (level > 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL && - level <= (parLevel - OMP_ACTIVE_PARALLEL_LEVEL))) { - rc = 0; - } - PRINT(LD_IO, "call omp_get_ancestor_thread_num(level %d) returns %d\n", level, - rc) - return rc; -} - -EXTERN int omp_get_team_size(int level) { - if (isSPMDMode()) - return level == 1 ? GetNumberOfThreadsInBlock() : 1; - int rc = -1; - unsigned parLevel = parallelLevel[GetWarpId()]; - // If level is 0 or all parallel regions are not active - return 1. - if (level == 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL) { - rc = threadsInTeam; - } else if (level == 0 || - (level > 0 && parLevel < OMP_ACTIVE_PARALLEL_LEVEL && - level <= parLevel) || - (level > 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL && - level <= (parLevel - OMP_ACTIVE_PARALLEL_LEVEL))) { - rc = 1; - } - PRINT(LD_IO, "call omp_get_team_size(level %d) returns %d\n", level, rc) - return rc; -} - EXTERN void omp_get_schedule(omp_sched_t *kind, int *modifier) { if (isRuntimeUninitialized()) { ASSERT0(LT_FUSSY, isSPMDMode(), diff --git a/openmp/libomptarget/deviceRTLs/common/src/loop.cu b/openmp/libomptarget/deviceRTLs/common/src/loop.cu --- a/openmp/libomptarget/deviceRTLs/common/src/loop.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/loop.cu @@ -11,6 +11,7 @@ // interface as loops. // //===----------------------------------------------------------------------===// +#include "ICVs.h" #pragma omp declare target #include "common/omptarget.h" @@ -103,7 +104,7 @@ // Assume we are in teams region or that we use a single block // per target region - ST numberOfActiveOMPThreads = GetNumberOfOmpThreads(IsSPMDExecutionMode); + ST numberOfActiveOMPThreads = omp_get_num_threads(); // All warps that are in excess of the maximum requested, do // not execute the loop @@ -211,7 +212,7 @@ } int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid); - T tnum = GetNumberOfOmpThreads(checkSPMDMode(loc)); + T tnum = omp_get_num_threads(); T tripCount = ub - lb + 1; // +1 because ub is inclusive ASSERT0(LT_FUSSY, threadId < tnum, "current thread is not needed here; error"); @@ -453,7 +454,7 @@ // automatically selects thread or warp ID based on selected implementation int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); - ASSERT0(LT_FUSSY, gtid < GetNumberOfOmpThreads(checkSPMDMode(loc)), + ASSERT0(LT_FUSSY, gtid < omp_get_num_threads(), "current thread is not needed here; error"); // retrieve schedule kmp_sched_t schedule = @@ -507,9 +508,9 @@ PRINT(LD_LOOP, "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, " "last %d\n", - (int)GetNumberOfOmpThreads(isSPMDMode()), - (int)GetNumberOfWorkersInTeam(), (long long)*plower, - (long long)*pupper, (long long)*pstride, (int)*plast); + (int)omp_get_num_threads(), (int)GetNumberOfWorkersInTeam(), + (long long)*plower, (long long)*pupper, (long long)*pstride, + (int)*plast); return DISPATCH_NOTFINISHED; } diff --git a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu b/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu --- a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu @@ -34,13 +34,6 @@ DEVICE uint32_t SHARED(usedMemIdx); DEVICE uint32_t SHARED(usedSlotIdx); -// SHARED doesn't work with array so we add the attribute explicitly. -[[clang::loader_uninitialized]] DEVICE uint8_t - parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; -#pragma omp allocate(parallelLevel) allocator(omp_pteam_mem_alloc) -DEVICE uint16_t SHARED(threadLimit); -DEVICE uint16_t SHARED(threadsInTeam); -DEVICE uint16_t SHARED(nThreads); // Pointer to this team's OpenMP state object DEVICE omptarget_nvptx_ThreadPrivateContext * SHARED(omptarget_nvptx_threadPrivateContext); diff --git a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu --- a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu @@ -11,8 +11,13 @@ //===----------------------------------------------------------------------===// #pragma omp declare target +#include "TeamState.h" +#include "ThreadState.h" #include "common/omptarget.h" #include "target_impl.h" +#include "target_interface.h" + +#include //////////////////////////////////////////////////////////////////////////////// // global data tables @@ -26,14 +31,12 @@ // init entry points //////////////////////////////////////////////////////////////////////////////// -EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) { +EXTERN void __kmpc_kernel_init(int, int16_t RequiresOMPRuntime) { PRINT(LD_IO, "call to __kmpc_kernel_init with version %f\n", OMPTARGET_NVPTX_VERSION); ASSERT0(LT_FUSSY, RequiresOMPRuntime, "Generic always requires initialized runtime."); setExecutionParameters(Generic, RuntimeInitialized); - for (int I = 0; I < MAX_THREADS_PER_TEAM / WARPSIZE; ++I) - parallelLevel[I] = 0; int threadIdInBlock = GetThreadIdInBlock(); ASSERT0(LT_FUSSY, threadIdInBlock == GetMasterThreadID(), @@ -62,9 +65,16 @@ // set number of threads and thread limit in team to started value omptarget_nvptx_TaskDescr *currTaskDescr = omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); - nThreads = GetNumberOfThreadsInBlock(); - threadLimit = ThreadLimit; __kmpc_impl_target_init(); + + if (threadId == 0) { + omp::TeamState.ICVState.nthreads_var = GetNumberOfThreadsInBlock(); + omp::TeamState.ICVState.levels_var = 0; + omp::TeamState.ICVState.active_level = -1; + omp::TeamState.ParallelTeamSize = -1; + memset(omp::ThreadStates, 0, + GetNumberOfThreadsInBlock() * sizeof(omp::ThreadStates[0])); + } } EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) { @@ -72,27 +82,26 @@ ASSERT0(LT_FUSSY, IsOMPRuntimeInitialized, "Generic always requires initialized runtime."); // Enqueue omp state object for use by another team. - int slot = usedSlotIdx; + int slot = __kmpc_impl_smid() % MAX_SM; omptarget_nvptx_device_State[slot].Enqueue( omptarget_nvptx_threadPrivateContext); // Done with work. Kill the workers. omptarget_nvptx_workFn = 0; } -EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, - int16_t RequiresOMPRuntime) { +EXTERN void __kmpc_spmd_kernel_init(int, int16_t RequiresOMPRuntime) { PRINT0(LD_IO, "call to __kmpc_spmd_kernel_init\n"); setExecutionParameters(Spmd, RequiresOMPRuntime ? RuntimeInitialized : RuntimeUninitialized); int threadId = GetThreadIdInBlock(); if (threadId == 0) { - usedSlotIdx = __kmpc_impl_smid() % MAX_SM; - parallelLevel[0] = - 1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0); - } else if (GetLaneId() == 0) { - parallelLevel[GetWarpId()] = - 1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0); + omp::TeamState.ICVState.nthreads_var = 1; + omp::TeamState.ICVState.levels_var = 1; + omp::TeamState.ICVState.active_level = GetNumberOfThreadsInBlock() > 1; + omp::TeamState.ParallelTeamSize = GetNumberOfThreadsInBlock(); + memset(omp::ThreadStates, 0, + GetNumberOfThreadsInBlock() * sizeof(omp::ThreadStates[0])); } if (!RequiresOMPRuntime) { // Runtime is not required - exit. diff --git a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu --- a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu @@ -31,31 +31,27 @@ // To make a long story short... // //===----------------------------------------------------------------------===// +#include "ICVs.h" +#include "TeamState.h" +#include "ThreadState.h" +#include "interface.h" #pragma omp declare target #include "common/omptarget.h" #include "target_impl.h" +using namespace omp; + //////////////////////////////////////////////////////////////////////////////// // support for parallel that goes parallel (1 static level only) //////////////////////////////////////////////////////////////////////////////// -INLINE static uint16_t determineNumberOfThreads(uint16_t NumThreadsClause, - uint16_t NThreadsICV, - uint16_t ThreadLimit) { - uint16_t ThreadsRequested = NThreadsICV; - if (NumThreadsClause != 0) { - ThreadsRequested = NumThreadsClause; - } - - uint16_t ThreadsAvailable = GetNumberOfWorkersInTeam(); - if (ThreadLimit != 0 && ThreadLimit < ThreadsAvailable) { - ThreadsAvailable = ThreadLimit; - } +INLINE static uint16_t determineNumberOfThreads() { + int NThreadsICV = ICVStateTy::getICVForThread(&ICVStateTy::nthreads_var); - uint16_t NumThreads = ThreadsAvailable; - if (ThreadsRequested != 0 && ThreadsRequested < NumThreads) { - NumThreads = ThreadsRequested; + uint16_t NumThreads = GetNumberOfWorkersInTeam(); + if (NThreadsICV != 0 && NThreadsICV < NumThreads) { + NumThreads = NThreadsICV; } #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 @@ -92,26 +88,25 @@ return; } - uint16_t &NumThreadsClause = - omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId); - - uint16_t NumThreads = - determineNumberOfThreads(NumThreadsClause, nThreads, threadLimit); - - if (NumThreadsClause != 0) { - // Reset request to avoid propagating to successive #parallel - NumThreadsClause = 0; - } + uint16_t NumThreads = determineNumberOfThreads(); + TeamState.ParallelTeamSize = NumThreads; - ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads", - (int)NumThreads); ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(), "only team master can create parallel"); // Set number of threads on work descriptor. omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr); - threadsInTeam = NumThreads; + + // We do *not* create a new data environment because all threads in the team + // that are active are now running this parallel region. They share the + // TeamState, which has an increase level-var and potentially active-level + // set, but they do not have individual ThreadStates yet. If they ever + // modify the ICVs beyond this point a ThreadStates will be allocated. + auto Level = TeamState.ICVState.levels_var += 1; + bool IsActiveParallelRegion = NumThreads > 1; + if (IsActiveParallelRegion) + TeamState.ICVState.active_level = Level; } // All workers call this function. Deactivate those not needed. @@ -122,6 +117,11 @@ EXTERN bool __kmpc_kernel_parallel(void **WorkFn) { PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n"); + int threadId = GetThreadIdInBlock(); + // In case we have modified an ICV for this thread before a ThreadState was + // created. We drop it now to not contaminate the next parallel region. + ThreadStateTy::dropForThread(threadId); + // Work function and arguments for L1 parallel region. *WorkFn = omptarget_nvptx_workFn; @@ -131,14 +131,15 @@ return false; } + uint16_t NumThreads = determineNumberOfThreads(); + // Only the worker threads call this routine and the master warp // never arrives here. Therefore, use the nvptx thread id. - int threadId = GetThreadIdInBlock(); omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); // Set to true for workers participating in the parallel region. - bool isActive = false; + bool ThreadIsActive = threadId < NumThreads; // Initialize state for active threads. - if (threadId < threadsInTeam) { + if (ThreadIsActive) { // init work descriptor from workdesccr omptarget_nvptx_TaskDescr *newTaskDescr = omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId); @@ -152,21 +153,9 @@ "thread will execute parallel region with id %d in a team of " "%d threads\n", (int)newTaskDescr->ThreadId(), (int)nThreads); - - isActive = true; - // Reconverge the threads at the end of the parallel region to correctly - // handle parallel levels. - // In Cuda9+ in non-SPMD mode we have either 1 worker thread or the whole - // warp. If only 1 thread is active, not need to reconverge the threads. - // If we have the whole warp, reconverge all the threads in the warp before - // actually trying to change the parallel level. Otherwise, parallel level - // can be changed incorrectly because of threads divergence. - bool IsActiveParallelRegion = threadsInTeam != 1; - IncParallelLevel(IsActiveParallelRegion, - IsActiveParallelRegion ? __kmpc_impl_all_lanes : 1u); } - return isActive; + return ThreadIsActive; } EXTERN void __kmpc_kernel_end_parallel() { @@ -181,16 +170,13 @@ omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( threadId, currTaskDescr->GetPrevTaskDescr()); - // Reconverge the threads at the end of the parallel region to correctly - // handle parallel levels. - // In Cuda9+ in non-SPMD mode we have either 1 worker thread or the whole - // warp. If only 1 thread is active, not need to reconverge the threads. - // If we have the whole warp, reconverge all the threads in the warp before - // actually trying to change the parallel level. Otherwise, parallel level can - // be changed incorrectly because of threads divergence. - bool IsActiveParallelRegion = threadsInTeam != 1; - DecParallelLevel(IsActiveParallelRegion, - IsActiveParallelRegion ? __kmpc_impl_all_lanes : 1u); + // We did *not* create a new data environment because all threads in the team + // that were active were running the parallel region. We used the TeamState + // which needs adjustment now. + TeamState.ICVState.levels_var -= 1; + bool IsActiveParallelRegion = omp_get_num_threads() > 1; + if (IsActiveParallelRegion) + TeamState.ICVState.active_level = 0; } //////////////////////////////////////////////////////////////////////////////// @@ -200,7 +186,8 @@ EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) { PRINT0(LD_IO, "call to __kmpc_serialized_parallel\n"); - IncParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask()); + ThreadStateTy::enterDataEnvironment(); + ICVStateTy::incICVForThread(&ICVStateTy::levels_var, 1); if (checkRuntimeUninitialized(loc)) { ASSERT0(LT_FUSSY, checkSPMDMode(loc), @@ -239,7 +226,8 @@ uint32_t global_tid) { PRINT0(LD_IO, "call to __kmpc_end_serialized_parallel\n"); - DecParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask()); + ThreadStateTy::exitDataEnvironment(); + ICVStateTy::incICVForThread(&ICVStateTy::levels_var, -1); if (checkRuntimeUninitialized(loc)) { ASSERT0(LT_FUSSY, checkSPMDMode(loc), @@ -261,8 +249,7 @@ EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid) { PRINT0(LD_IO, "call to __kmpc_parallel_level\n"); - - return parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1); + return omp_get_level(); } // This kmpc call returns the thread id across all teams. It's value is @@ -270,22 +257,16 @@ // it's cheap to recalculate this value so we never use the result // of this call. EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc) { - int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); - return GetOmpThreadId(tid, checkSPMDMode(loc)); + return omp_get_thread_num(); } //////////////////////////////////////////////////////////////////////////////// // push params //////////////////////////////////////////////////////////////////////////////// -EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t tid, - int32_t num_threads) { - PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads); - ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), - "Runtime must be initialized."); - tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); - omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(tid) = - num_threads; +EXTERN void __kmpc_push_num_threads(kmp_Ident *Loc, int32_t TId, + int32_t NumThreads) { + ICVStateTy::setICVForThread(&ICVStateTy::nthreads_var, NumThreads); } // Do nothing. The host guarantees we started the requested number of diff --git a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu --- a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu @@ -9,6 +9,7 @@ // This file contains the implementation of reduction with KMPC interface. // //===----------------------------------------------------------------------===// +#include "ICVs.h" #pragma omp declare target #include "common/omptarget.h" @@ -81,7 +82,7 @@ kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, bool isSPMDExecutionMode, bool isRuntimeUninitialized) { uint32_t BlockThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode); - uint32_t NumThreads = GetNumberOfOmpThreads(isSPMDExecutionMode); + uint32_t NumThreads = omp_get_num_threads(); if (NumThreads == 1) return 1; /* @@ -204,8 +205,7 @@ // In generic mode only the team master participates in the teams // reduction because the workers are waiting for parallel work. uint32_t NumThreads = - checkSPMDMode(loc) ? GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true) - : /*Master thread only*/ 1; + checkSPMDMode(loc) ? omp_get_num_threads() : /*Master thread only*/ 1; uint32_t TeamId = GetBlockIdInKernel(); uint32_t NumTeams = GetNumberOfBlocksInKernel(); static unsigned SHARED(Bound); diff --git a/openmp/libomptarget/deviceRTLs/common/src/support.cu b/openmp/libomptarget/deviceRTLs/common/src/support.cu --- a/openmp/libomptarget/deviceRTLs/common/src/support.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/support.cu @@ -136,37 +136,6 @@ // //////////////////////////////////////////////////////////////////////////////// -DEVICE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode) { - // omp_thread_num - int rc; - if ((parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1)) > 1) { - rc = 0; - } else if (isSPMDExecutionMode) { - rc = GetThreadIdInBlock(); - } else { - omptarget_nvptx_TaskDescr *currTaskDescr = - omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); - ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr"); - rc = currTaskDescr->ThreadId(); - } - return rc; -} - -DEVICE int GetNumberOfOmpThreads(bool isSPMDExecutionMode) { - // omp_num_threads - int rc; - int Level = parallelLevel[GetWarpId()]; - if (Level != OMP_ACTIVE_PARALLEL_LEVEL + 1) { - rc = 1; - } else if (isSPMDExecutionMode) { - rc = GetNumberOfThreadsInBlock(); - } else { - rc = threadsInTeam; - } - - return rc; -} - //////////////////////////////////////////////////////////////////////////////// // Team id linked to OpenMP @@ -185,33 +154,6 @@ DEVICE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); } -//////////////////////////////////////////////////////////////////////////////// -// Parallel level - -DEVICE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) { - __kmpc_impl_syncwarp(Mask); - __kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt(); - unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt); - if (Rank == 0) { - parallelLevel[GetWarpId()] += - (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); - __kmpc_impl_threadfence(); - } - __kmpc_impl_syncwarp(Mask); -} - -DEVICE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) { - __kmpc_impl_syncwarp(Mask); - __kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt(); - unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt); - if (Rank == 0) { - parallelLevel[GetWarpId()] -= - (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); - __kmpc_impl_threadfence(); - } - __kmpc_impl_syncwarp(Mask); -} - //////////////////////////////////////////////////////////////////////////////// // get OpenMP number of procs diff --git a/openmp/libomptarget/deviceRTLs/common/src/sync.cu b/openmp/libomptarget/deviceRTLs/common/src/sync.cu --- a/openmp/libomptarget/deviceRTLs/common/src/sync.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/sync.cu @@ -9,6 +9,7 @@ // Include all synchronization. // //===----------------------------------------------------------------------===// +#include "ICVs.h" #pragma omp declare target #include "common/omptarget.h" @@ -48,8 +49,7 @@ __kmpc_barrier_simple_spmd(loc_ref, tid); } else { tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc_ref)); - int numberOfActiveOMPThreads = - GetNumberOfOmpThreads(checkSPMDMode(loc_ref)); + int numberOfActiveOMPThreads = omp_get_num_threads(); if (numberOfActiveOMPThreads > 1) { if (checkSPMDMode(loc_ref)) { __kmpc_barrier_simple_spmd(loc_ref, tid); diff --git a/openmp/libomptarget/deviceRTLs/common/support.h b/openmp/libomptarget/deviceRTLs/common/support.h --- a/openmp/libomptarget/deviceRTLs/common/support.h +++ b/openmp/libomptarget/deviceRTLs/common/support.h @@ -56,13 +56,10 @@ DEVICE int GetNumberOfWorkersInTeam(); // get OpenMP thread and team ids -DEVICE int GetOmpThreadId(int threadId, - bool isSPMDExecutionMode); // omp_thread_num -DEVICE int GetOmpTeamId(); // omp_team_num +DEVICE int GetOmpTeamId(); // omp_team_num // get OpenMP number of threads and team -DEVICE int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads -DEVICE int GetNumberOfOmpTeams(); // omp_num_teams +DEVICE int GetNumberOfOmpTeams(); // omp_num_teams // get OpenMP number of procs DEVICE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode); @@ -71,10 +68,6 @@ // masters DEVICE int IsTeamMaster(int ompThreadId); -// Parallel level -DEVICE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask); -DEVICE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask); - //////////////////////////////////////////////////////////////////////////////// // Memory //////////////////////////////////////////////////////////////////////////////// diff --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt --- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt +++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt @@ -68,6 +68,8 @@ get_filename_component(devicertl_base_directory ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) +set(devicertl_include_directory + ${devicertl_base_directory}/common/include) set(devicertl_common_directory ${devicertl_base_directory}/common) set(devicertl_nvptx_directory @@ -128,6 +130,7 @@ ${devicertl_common_directory}/src/support.cu ${devicertl_common_directory}/src/sync.cu ${devicertl_common_directory}/src/task.cu + ${devicertl_common_directory}/src/ICVs.cpp src/target_impl.cu ) @@ -139,6 +142,8 @@ -fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device -D__CUDACC__ -I${devicertl_base_directory} + -I${devicertl_common_directory} + -I${devicertl_include_directory} -I${devicertl_nvptx_directory}/src) if(${LIBOMPTARGET_NVPTX_DEBUG}) diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -62,8 +62,6 @@ #endif #endif -#define OMP_ACTIVE_PARALLEL_LEVEL 128 - // Data sharing related quantities, need to match what is used in the compiler. enum DATA_SHARING_SIZES { // The maximum number of workers in a kernel. diff --git a/openmp/libomptarget/test/offloading/omp_get_team_size.c b/openmp/libomptarget/test/offloading/omp_get_team_size.c new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/test/offloading/omp_get_team_size.c @@ -0,0 +1,36 @@ +// TODO: make this a test case +#include +#include +#define N 12 + +void foo(int *A) { + if (omp_get_thread_num() == N / 2) + { + for (int i = 0; i < 4; ++i) + A[i] = omp_get_team_size(i); + #pragma omp parallel + { + for (int i = 0; i < 4; ++i) + A[i+4] = omp_get_team_size(i); + #pragma omp parallel + { + for (int i = 0; i < 4; ++i) + A[i+8] = omp_get_team_size(i); + } + } + } +} + +int main() { + int A[N]; + for (int i = 0; i < N; ++i) + A[i] = 42424242; + #pragma omp target teams num_teams(1) thread_limit(N) map(tofrom:A[:N]) + { + #pragma omp parallel + foo(A); + } + for (int i = 0; i < N; ++i) + printf("%i : %i\n", i, A[i]); + return 0; +} diff --git a/openmp/libomptarget/test/offloading/omp_set_num_threads.c b/openmp/libomptarget/test/offloading/omp_set_num_threads.c new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/test/offloading/omp_set_num_threads.c @@ -0,0 +1,45 @@ +// TODO: make this a test case +#include +#include +#define N 12 + +void foo(int *A) { + if (omp_get_thread_num() == N / 2) + { + for (int i = 0; i < 2; ++i) + A[i] = omp_get_max_threads(); + omp_set_num_threads(42); + for (int i = 2; i < 4; ++i) + A[i] = omp_get_max_threads(); + #pragma omp parallel + { + for (int i = 0; i < 2; ++i) + A[i+4] = omp_get_max_threads(); + omp_set_num_threads(4242); + for (int i = 2; i < 4; ++i) + A[i+4] = omp_get_max_threads(); + #pragma omp parallel + { + for (int i = 0; i < 2; ++i) + A[i+8] = omp_get_max_threads(); + omp_set_num_threads(424242); + for (int i = 2; i < 4; ++i) + A[i+8] = omp_get_max_threads(); + } + } + } +} + +int main() { + int A[N]; + for (int i = 0; i < N; ++i) + A[i] = 42424242; + #pragma omp target teams num_teams(1) thread_limit(N) map(tofrom:A[:N]) + { + #pragma omp parallel + foo(A); + } + for (int i = 0; i < N; ++i) + printf("%i : %i\n", i, A[i]); + return 0; +}