diff --git a/clang/test/OpenMP/nvptx_target_simd_codegen.cpp b/clang/test/OpenMP/nvptx_target_simd_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_simd_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_simd_codegen.cpp @@ -78,7 +78,6 @@ // CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0, i16 0) // CHECK-NOT: call void @__kmpc_for_static_init // CHECK-NOT: call void @__kmpc_for_static_fini -// CHECK-NOT: call i32 @__kmpc_nvptx_simd_reduce_nowait( // CHECK-NOT: call void @__kmpc_nvptx_end_reduce_nowait( // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) // CHECK: ret void diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h @@ -140,8 +140,6 @@ DEVICE unsigned GetWarpId(); DEVICE unsigned GetLaneId(); -DEVICE bool __kmpc_impl_is_first_active_thread(); - // Locks DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock); DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *lock); diff --git a/openmp/libomptarget/deviceRTLs/common/omptarget.h b/openmp/libomptarget/deviceRTLs/common/omptarget.h --- a/openmp/libomptarget/deviceRTLs/common/omptarget.h +++ b/openmp/libomptarget/deviceRTLs/common/omptarget.h @@ -200,7 +200,6 @@ INLINE omptarget_nvptx_WorkDescr &WorkDescr() { return workDescrForActiveParallel; } - INLINE uint64_t *getLastprivateIterBuffer() { return &lastprivateIterBuffer; } // init INLINE void InitTeamDescr(); @@ -251,7 +250,6 @@ levelZeroTaskDescr; // icv for team master initial thread omptarget_nvptx_WorkDescr workDescrForActiveParallel; // one, ONLY for the active par - uint64_t lastprivateIterBuffer; ALIGN(16) __kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE]; @@ -277,10 +275,6 @@ INLINE uint16_t &NumThreadsForNextParallel(int tid) { return nextRegion.tnum[tid]; } - // simd - INLINE uint16_t &SimdLimitForNextSimd(int tid) { - return nextRegion.slim[tid]; - } // schedule (for dispatch) INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; } INLINE int64_t &Chunk(int tid) { return chunk[tid]; } @@ -304,8 +298,6 @@ // Only one of the two is live at the same time. // parallel uint16_t tnum[MAX_THREADS_PER_TEAM]; - // simd limit - uint16_t slim[MAX_THREADS_PER_TEAM]; } nextRegion; // schedule (for dispatch) kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for diff --git a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu --- a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu @@ -17,297 +17,6 @@ return !isSPMDExecutionMode && GetMasterThreadID() == GetThreadIdInBlock(); } -/// Return the provided size aligned to the size of a pointer. -INLINE static size_t AlignVal(size_t Val) { - const size_t Align = (size_t)sizeof(void *); - if (Val & (Align - 1)) { - Val += Align; - Val &= ~(Align - 1); - } - return Val; -} - -#define DSFLAG 0 -#define DSFLAG_INIT 0 -#define DSPRINT(_flag, _str, _args...) \ - { \ - if (_flag) { \ - /*printf("(%d,%d) -> " _str, blockIdx.x, threadIdx.x, _args);*/ \ - } \ - } -#define DSPRINT0(_flag, _str) \ - { \ - if (_flag) { \ - /*printf("(%d,%d) -> " _str, blockIdx.x, threadIdx.x);*/ \ - } \ - } - -// Initialize the shared data structures. This is expected to be called for the -// master thread and warp masters. 
\param RootS: A pointer to the root of the -// data sharing stack. \param InitialDataSize: The initial size of the data in -// the slot. -EXTERN void -__kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *rootS, - size_t InitialDataSize) { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); - DSPRINT0(DSFLAG_INIT, - "Entering __kmpc_initialize_data_sharing_environment\n"); - - unsigned WID = GetWarpId(); - DSPRINT(DSFLAG_INIT, "Warp ID: %u\n", WID); - - omptarget_nvptx_TeamDescr *teamDescr = - &omptarget_nvptx_threadPrivateContext->TeamContext(); - __kmpc_data_sharing_slot *RootS = - teamDescr->RootS(WID, IsMasterThread(isSPMDMode())); - - DataSharingState.SlotPtr[WID] = RootS; - DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0]; - - // We don't need to initialize the frame and active threads. - - DSPRINT(DSFLAG_INIT, "Initial data size: %08x \n", (unsigned)InitialDataSize); - DSPRINT(DSFLAG_INIT, "Root slot at: %016llx \n", (unsigned long long)RootS); - DSPRINT(DSFLAG_INIT, "Root slot data-end at: %016llx \n", - (unsigned long long)RootS->DataEnd); - DSPRINT(DSFLAG_INIT, "Root slot next at: %016llx \n", - (unsigned long long)RootS->Next); - DSPRINT(DSFLAG_INIT, "Shared slot ptr at: %016llx \n", - (unsigned long long)DataSharingState.SlotPtr[WID]); - DSPRINT(DSFLAG_INIT, "Shared stack ptr at: %016llx \n", - (unsigned long long)DataSharingState.StackPtr[WID]); - - DSPRINT0(DSFLAG_INIT, "Exiting __kmpc_initialize_data_sharing_environment\n"); -} - -EXTERN void *__kmpc_data_sharing_environment_begin( - __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack, - void **SavedSharedFrame, __kmpc_impl_lanemask_t *SavedActiveThreads, - size_t SharingDataSize, size_t SharingDefaultDataSize, - int16_t IsOMPRuntimeInitialized) { - - DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_begin\n"); - - // If the runtime has been elided, used shared memory for master-worker - // data sharing. - if (!IsOMPRuntimeInitialized) - return (void *)&DataSharingState; - - DSPRINT(DSFLAG, "Data Size %016llx\n", (unsigned long long)SharingDataSize); - DSPRINT(DSFLAG, "Default Data Size %016llx\n", - (unsigned long long)SharingDefaultDataSize); - - unsigned WID = GetWarpId(); - __kmpc_impl_lanemask_t CurActiveThreads = __kmpc_impl_activemask(); - - __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; - void *&StackP = DataSharingState.StackPtr[WID]; - void * volatile &FrameP = DataSharingState.FramePtr[WID]; - __kmpc_impl_lanemask_t &ActiveT = DataSharingState.ActiveThreads[WID]; - - DSPRINT0(DSFLAG, "Save current slot/stack values.\n"); - // Save the current values. - *SavedSharedSlot = SlotP; - *SavedSharedStack = StackP; - *SavedSharedFrame = FrameP; - *SavedActiveThreads = ActiveT; - - DSPRINT(DSFLAG, "Warp ID: %u\n", WID); - DSPRINT(DSFLAG, "Saved slot ptr at: %016llx \n", (unsigned long long)SlotP); - DSPRINT(DSFLAG, "Saved stack ptr at: %016llx \n", (unsigned long long)StackP); - DSPRINT(DSFLAG, "Saved frame ptr at: %016llx \n", (long long)FrameP); - DSPRINT(DSFLAG, "Active threads: %08x \n", (unsigned)ActiveT); - - // Only the warp active master needs to grow the stack. - if (__kmpc_impl_is_first_active_thread()) { - // Save the current active threads. - ActiveT = CurActiveThreads; - - // Make sure we use aligned sizes to avoid rematerialization of data. - SharingDataSize = AlignVal(SharingDataSize); - // FIXME: The default data size can be assumed to be aligned? 
- SharingDefaultDataSize = AlignVal(SharingDefaultDataSize); - - // Check if we have room for the data in the current slot. - const uintptr_t CurrentStartAddress = (uintptr_t)StackP; - const uintptr_t CurrentEndAddress = (uintptr_t)SlotP->DataEnd; - const uintptr_t RequiredEndAddress = - CurrentStartAddress + (uintptr_t)SharingDataSize; - - DSPRINT(DSFLAG, "Data Size %016llx\n", (unsigned long long)SharingDataSize); - DSPRINT(DSFLAG, "Default Data Size %016llx\n", - (unsigned long long)SharingDefaultDataSize); - DSPRINT(DSFLAG, "Current Start Address %016llx\n", - (unsigned long long)CurrentStartAddress); - DSPRINT(DSFLAG, "Current End Address %016llx\n", - (unsigned long long)CurrentEndAddress); - DSPRINT(DSFLAG, "Required End Address %016llx\n", - (unsigned long long)RequiredEndAddress); - DSPRINT(DSFLAG, "Active Threads %08x\n", (unsigned)ActiveT); - - // If we require a new slot, allocate it and initialize it (or attempt to - // reuse one). Also, set the shared stack and slot pointers to the new - // place. If we do not need to grow the stack, just adapt the stack and - // frame pointers. - if (CurrentEndAddress < RequiredEndAddress) { - size_t NewSize = (SharingDataSize > SharingDefaultDataSize) - ? SharingDataSize - : SharingDefaultDataSize; - __kmpc_data_sharing_slot *NewSlot = 0; - - // Attempt to reuse an existing slot. - if (__kmpc_data_sharing_slot *ExistingSlot = SlotP->Next) { - uintptr_t ExistingSlotSize = (uintptr_t)ExistingSlot->DataEnd - - (uintptr_t)(&ExistingSlot->Data[0]); - if (ExistingSlotSize >= NewSize) { - DSPRINT(DSFLAG, "Reusing stack slot %016llx\n", - (unsigned long long)ExistingSlot); - NewSlot = ExistingSlot; - } else { - DSPRINT(DSFLAG, "Cleaning up -failed reuse - %016llx\n", - (unsigned long long)SlotP->Next); - SafeFree(ExistingSlot, "Failed reuse"); - } - } - - if (!NewSlot) { - NewSlot = (__kmpc_data_sharing_slot *)SafeMalloc( - sizeof(__kmpc_data_sharing_slot) + NewSize, - "Warp master slot allocation"); - DSPRINT(DSFLAG, "New slot allocated %016llx (data size=%016llx)\n", - (unsigned long long)NewSlot, NewSize); - } - - NewSlot->Next = 0; - NewSlot->DataEnd = &NewSlot->Data[NewSize]; - - SlotP->Next = NewSlot; - SlotP = NewSlot; - StackP = &NewSlot->Data[SharingDataSize]; - FrameP = &NewSlot->Data[0]; - } else { - - // Clean up any old slot that we may still have. The slot producers, do - // not eliminate them because that may be used to return data. - if (SlotP->Next) { - DSPRINT(DSFLAG, "Cleaning up - old not required - %016llx\n", - (unsigned long long)SlotP->Next); - SafeFree(SlotP->Next, "Old slot not required"); - SlotP->Next = 0; - } - - FrameP = StackP; - StackP = (void *)RequiredEndAddress; - } - } - - // FIXME: Need to see the impact of doing it here. - __kmpc_impl_threadfence_block(); - - DSPRINT0(DSFLAG, "Exiting __kmpc_data_sharing_environment_begin\n"); - - // All the threads in this warp get the frame they should work with. - return FrameP; -} - -EXTERN void __kmpc_data_sharing_environment_end( - __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack, - void **SavedSharedFrame, __kmpc_impl_lanemask_t *SavedActiveThreads, - int32_t IsEntryPoint) { - - DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_end\n"); - - unsigned WID = GetWarpId(); - - if (IsEntryPoint) { - if (__kmpc_impl_is_first_active_thread()) { - DSPRINT0(DSFLAG, "Doing clean up\n"); - - // The master thread cleans the saved slot, because this is an environment - // only for the master. - __kmpc_data_sharing_slot *S = IsMasterThread(isSPMDMode()) - ? 
*SavedSharedSlot - : DataSharingState.SlotPtr[WID]; - - if (S->Next) { - SafeFree(S->Next, "Sharing environment end"); - S->Next = 0; - } - } - - DSPRINT0(DSFLAG, "Exiting Exiting __kmpc_data_sharing_environment_end\n"); - return; - } - - __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask(); - - // Only the warp master can restore the stack and frame information, and only - // if there are no other threads left behind in this environment (i.e. the - // warp diverged and returns in different places). This only works if we - // assume that threads will converge right after the call site that started - // the environment. - if (__kmpc_impl_is_first_active_thread()) { - __kmpc_impl_lanemask_t &ActiveT = DataSharingState.ActiveThreads[WID]; - - DSPRINT0(DSFLAG, "Before restoring the stack\n"); - // Zero the bits in the mask. If it is still different from zero, then we - // have other threads that will return after the current ones. - ActiveT &= ~CurActive; - - DSPRINT(DSFLAG, "Active threads: %08x; New mask: %08x\n", - (unsigned)CurActive, (unsigned)ActiveT); - - if (!ActiveT) { - // No other active threads? Great, lets restore the stack. - - __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; - void *&StackP = DataSharingState.StackPtr[WID]; - void * volatile &FrameP = DataSharingState.FramePtr[WID]; - - SlotP = *SavedSharedSlot; - StackP = *SavedSharedStack; - FrameP = *SavedSharedFrame; - ActiveT = *SavedActiveThreads; - - DSPRINT(DSFLAG, "Restored slot ptr at: %016llx \n", - (unsigned long long)SlotP); - DSPRINT(DSFLAG, "Restored stack ptr at: %016llx \n", - (unsigned long long)StackP); - DSPRINT(DSFLAG, "Restored frame ptr at: %016llx \n", - (unsigned long long)FrameP); - DSPRINT(DSFLAG, "Active threads: %08x \n", (unsigned)ActiveT); - } - } - - // FIXME: Need to see the impact of doing it here. - __kmpc_impl_threadfence_block(); - - DSPRINT0(DSFLAG, "Exiting __kmpc_data_sharing_environment_end\n"); - return; -} - -EXTERN void * -__kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID, - int16_t IsOMPRuntimeInitialized) { - DSPRINT0(DSFLAG, "Entering __kmpc_get_data_sharing_environment_frame\n"); - - // If the runtime has been elided, use shared memory for master-worker - // data sharing. We're reusing the statically allocated data structure - // that is used for standard data sharing. - if (!IsOMPRuntimeInitialized) - return (void *)&DataSharingState; - - // Get the frame used by the requested thread. - - unsigned SourceWID = SourceThreadID / WARPSIZE; - - DSPRINT(DSFLAG, "Source warp: %u\n", SourceWID); - - void * volatile P = DataSharingState.FramePtr[SourceWID]; - DSPRINT0(DSFLAG, "Exiting __kmpc_get_data_sharing_environment_frame\n"); - return P; -} - //////////////////////////////////////////////////////////////////////////////// // Runtime functions for trunk data sharing scheme. 
//////////////////////////////////////////////////////////////////////////////// diff --git a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu --- a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu @@ -362,53 +362,3 @@ PRINT(LD_IO, "call omp_test_lock() return %d\n", rc); return rc; } - -// for xlf Fortran -// Fortran, the return is LOGICAL type - -#define FLOGICAL long -EXTERN FLOGICAL __xlf_omp_is_initial_device_i8() { - int ret = omp_is_initial_device(); - if (ret == 0) - return (FLOGICAL)0; - else - return (FLOGICAL)1; -} - -EXTERN int __xlf_omp_is_initial_device_i4() { - int ret = omp_is_initial_device(); - if (ret == 0) - return 0; - else - return 1; -} - -EXTERN long __xlf_omp_get_team_num_i4() { - int ret = omp_get_team_num(); - return (long)ret; -} - -EXTERN long __xlf_omp_get_num_teams_i4() { - int ret = omp_get_num_teams(); - return (long)ret; -} - -EXTERN void xlf_debug_print_int(int *p) { - printf("xlf DEBUG %d): %p %d\n", omp_get_team_num(), p, p == 0 ? 0 : *p); -} - -EXTERN void xlf_debug_print_long(long *p) { - printf("xlf DEBUG %d): %p %ld\n", omp_get_team_num(), p, p == 0 ? 0 : *p); -} - -EXTERN void xlf_debug_print_float(float *p) { - printf("xlf DEBUG %d): %p %f\n", omp_get_team_num(), p, p == 0 ? 0 : *p); -} - -EXTERN void xlf_debug_print_double(double *p) { - printf("xlf DEBUG %d): %p %f\n", omp_get_team_num(), p, p == 0 ? 0 : *p); -} - -EXTERN void xlf_debug_print_addr(void *p) { - printf("xlf DEBUG %d): %p \n", omp_get_team_num(), p); -} diff --git a/openmp/libomptarget/deviceRTLs/common/src/loop.cu b/openmp/libomptarget/deviceRTLs/common/src/loop.cu --- a/openmp/libomptarget/deviceRTLs/common/src/loop.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/loop.cu @@ -754,55 +754,3 @@ EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid) { PRINT0(LD_IO, "call kmpc_for_static_fini\n"); } - -namespace { -INLINE void syncWorkersInGenericMode(uint32_t NumThreads) { - int NumWarps = ((NumThreads + WARPSIZE - 1) / WARPSIZE); -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 - // On Volta and newer architectures we require that all lanes in - // a warp (at least, all present for the kernel launch) participate in the - // barrier. This is enforced when launching the parallel region. An - // exception is when there are < WARPSIZE workers. In this case only 1 worker - // is started, so we don't need a barrier. - if (NumThreads > 1) { -#endif - __kmpc_impl_named_sync(L1_BARRIER, WARPSIZE * NumWarps); -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 - } -#endif -} -}; // namespace - -EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc, int32_t gtid, - int32_t varNum, void *array) { - PRINT0(LD_IO, "call to __kmpc_reduce_conditional_lastprivate(...)\n"); - ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), - "Expected non-SPMD mode + initialized runtime."); - - omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor(); - uint32_t NumThreads = GetNumberOfOmpThreads(checkSPMDMode(loc)); - uint64_t *Buffer = teamDescr.getLastprivateIterBuffer(); - for (unsigned i = 0; i < varNum; i++) { - // Reset buffer. - if (gtid == 0) - *Buffer = 0; // Reset to minimum loop iteration value. - - // Barrier. - syncWorkersInGenericMode(NumThreads); - - // Atomic max of iterations. 
- uint64_t *varArray = (uint64_t *)array; - uint64_t elem = varArray[i]; - (void)__kmpc_atomic_max((unsigned long long int *)Buffer, - (unsigned long long int)elem); - - // Barrier. - syncWorkersInGenericMode(NumThreads); - - // Read max value and update thread private array. - varArray[i] = *Buffer; - - // Barrier. - syncWorkersInGenericMode(NumThreads); - } -} diff --git a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu --- a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu @@ -25,13 +25,6 @@ // init entry points //////////////////////////////////////////////////////////////////////////////// -EXTERN void __kmpc_kernel_init_params(void *Ptr) { - PRINT(LD_IO, "call to __kmpc_kernel_init_params with version %f\n", - OMPTARGET_NVPTX_VERSION); - - SetTeamsReductionScratchpadPtr(Ptr); -} - EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) { PRINT(LD_IO, "call to __kmpc_kernel_init with version %f\n", OMPTARGET_NVPTX_VERSION); @@ -152,10 +145,6 @@ } } -EXTERN __attribute__((deprecated)) void __kmpc_spmd_kernel_deinit() { - __kmpc_spmd_kernel_deinit_v2(isRuntimeInitialized()); -} - EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime) { // We're not going to pop the task descr stack of each thread since // there are no more parallel regions in SPMD mode. diff --git a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu --- a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu @@ -35,161 +35,6 @@ #include "common/omptarget.h" #include "target_impl.h" -typedef struct ConvergentSimdJob { - omptarget_nvptx_TaskDescr taskDescr; - omptarget_nvptx_TaskDescr *convHeadTaskDescr; - uint16_t slimForNextSimd; -} ConvergentSimdJob; - -//////////////////////////////////////////////////////////////////////////////// -// support for convergent simd (team of threads in a warp only) -//////////////////////////////////////////////////////////////////////////////// -EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, - __kmpc_impl_lanemask_t Mask, - bool *IsFinal, int32_t *LaneSource, - int32_t *LaneId, int32_t *NumLanes) { - PRINT0(LD_IO, "call to __kmpc_kernel_convergent_simd\n"); - __kmpc_impl_lanemask_t ConvergentMask = Mask; - int32_t ConvergentSize = __kmpc_impl_popc(ConvergentMask); - __kmpc_impl_lanemask_t WorkRemaining = ConvergentMask >> (*LaneSource + 1); - *LaneSource += __kmpc_impl_ffs(WorkRemaining); - *IsFinal = __kmpc_impl_popc(WorkRemaining) == 1; - __kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt(); - *LaneId = __kmpc_impl_popc(ConvergentMask & lanemask_lt); - - int threadId = GetLogicalThreadIdInBlock(isSPMDMode()); - int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource; - - ConvergentSimdJob *job = (ConvergentSimdJob *)buffer; - int32_t SimdLimit = - omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId); - job->slimForNextSimd = SimdLimit; - - int32_t SimdLimitSource = __kmpc_impl_shfl_sync(Mask, SimdLimit, *LaneSource); - // reset simdlimit to avoid propagating to successive #simd - if (SimdLimitSource > 0 && threadId == sourceThreadId) - omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) = 0; - - // We cannot have more than the # of convergent threads. 
- if (SimdLimitSource > 0) - *NumLanes = __kmpc_impl_min(ConvergentSize, SimdLimitSource); - else - *NumLanes = ConvergentSize; - ASSERT(LT_FUSSY, *NumLanes > 0, "bad thread request of %d threads", - (int)*NumLanes); - - // Set to true for lanes participating in the simd region. - bool isActive = false; - // Initialize state for active threads. - if (*LaneId < *NumLanes) { - omptarget_nvptx_TaskDescr *currTaskDescr = - omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); - omptarget_nvptx_TaskDescr *sourceTaskDescr = - omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr( - sourceThreadId); - job->convHeadTaskDescr = currTaskDescr; - // install top descriptor from the thread for which the lanes are working. - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, - sourceTaskDescr); - isActive = true; - } - - // requires a memory fence between threads of a warp - return isActive; -} - -EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer) { - PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n"); - // pop stack - int threadId = GetLogicalThreadIdInBlock(isSPMDMode()); - ConvergentSimdJob *job = (ConvergentSimdJob *)buffer; - omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) = - job->slimForNextSimd; - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( - threadId, job->convHeadTaskDescr); -} - -typedef struct ConvergentParallelJob { - omptarget_nvptx_TaskDescr taskDescr; - omptarget_nvptx_TaskDescr *convHeadTaskDescr; - uint16_t tnumForNextPar; -} ConvergentParallelJob; - -//////////////////////////////////////////////////////////////////////////////// -// support for convergent parallelism (team of threads in a warp only) -//////////////////////////////////////////////////////////////////////////////// -EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, - __kmpc_impl_lanemask_t Mask, - bool *IsFinal, - int32_t *LaneSource) { - PRINT0(LD_IO, "call to __kmpc_kernel_convergent_parallel\n"); - __kmpc_impl_lanemask_t ConvergentMask = Mask; - int32_t ConvergentSize = __kmpc_impl_popc(ConvergentMask); - __kmpc_impl_lanemask_t WorkRemaining = ConvergentMask >> (*LaneSource + 1); - *LaneSource += __kmpc_impl_ffs(WorkRemaining); - *IsFinal = __kmpc_impl_popc(WorkRemaining) == 1; - __kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt(); - uint32_t OmpId = __kmpc_impl_popc(ConvergentMask & lanemask_lt); - - int threadId = GetLogicalThreadIdInBlock(isSPMDMode()); - int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource; - - ConvergentParallelJob *job = (ConvergentParallelJob *)buffer; - int32_t NumThreadsClause = - omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId); - job->tnumForNextPar = NumThreadsClause; - - int32_t NumThreadsSource = - __kmpc_impl_shfl_sync(Mask, NumThreadsClause, *LaneSource); - // reset numthreads to avoid propagating to successive #parallel - if (NumThreadsSource > 0 && threadId == sourceThreadId) - omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) = - 0; - - // We cannot have more than the # of convergent threads. - uint16_t NumThreads; - if (NumThreadsSource > 0) - NumThreads = __kmpc_impl_min(ConvergentSize, NumThreadsSource); - else - NumThreads = ConvergentSize; - ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads", - (int)NumThreads); - - // Set to true for workers participating in the parallel region. - bool isActive = false; - // Initialize state for active threads. 
- if (OmpId < NumThreads) { - // init L2 task descriptor and storage for the L1 parallel task descriptor. - omptarget_nvptx_TaskDescr *newTaskDescr = &job->taskDescr; - ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr"); - omptarget_nvptx_TaskDescr *currTaskDescr = - omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); - omptarget_nvptx_TaskDescr *sourceTaskDescr = - omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr( - sourceThreadId); - job->convHeadTaskDescr = currTaskDescr; - newTaskDescr->CopyConvergentParent(sourceTaskDescr, OmpId, NumThreads); - // install new top descriptor - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, - newTaskDescr); - isActive = true; - } - - // requires a memory fence between threads of a warp - return isActive; -} - -EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer) { - PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n"); - // pop stack - int threadId = GetLogicalThreadIdInBlock(isSPMDMode()); - ConvergentParallelJob *job = (ConvergentParallelJob *)buffer; - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( - threadId, job->convHeadTaskDescr); - omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) = - job->tnumForNextPar; -} - //////////////////////////////////////////////////////////////////////////////// // support for parallel that goes parallel (1 static level only) //////////////////////////////////////////////////////////////////////////////// @@ -446,14 +291,6 @@ num_threads; } -EXTERN void __kmpc_push_simd_limit(kmp_Ident *loc, int32_t tid, - int32_t simd_limit) { - PRINT(LD_IO, "call kmpc_push_simd_limit %d\n", (int)simd_limit); - ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized."); - tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); - omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(tid) = simd_limit; -} - // Do nothing. The host guarantees we started the requested number of // teams and we only need inspection of gridDim. diff --git a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu --- a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu @@ -73,22 +73,6 @@ return (logical_lane_id == 0); } -EXTERN -int32_t __kmpc_nvptx_simd_reduce_nowait(int32_t global_tid, int32_t num_vars, - size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, - kmp_InterWarpCopyFctPtr cpyFct) { - __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask(); - if (Liveness == __kmpc_impl_all_lanes) { - gpu_regular_warp_reduce(reduce_data, shflFct); - return GetThreadIdInBlock() % WARPSIZE == - 0; // Result on lane 0 of the simd warp. - } else { - return gpu_irregular_simd_reduce( - reduce_data, shflFct); // Result on the first active lane. 
- } -} - INLINE static int32_t nvptx_parallel_reduce_nowait( int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, @@ -177,14 +161,6 @@ #endif // __CUDA_ARCH__ >= 700 } -EXTERN __attribute__((deprecated)) int32_t __kmpc_nvptx_parallel_reduce_nowait( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) { - return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size, - reduce_data, shflFct, cpyFct, - isSPMDMode(), isRuntimeUninitialized()); -} - EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_v2( kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size, @@ -195,201 +171,6 @@ checkSPMDMode(loc), checkRuntimeUninitialized(loc)); } -EXTERN -int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) { - return nvptx_parallel_reduce_nowait( - global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct, - /*isSPMDExecutionMode=*/true, /*isRuntimeUninitialized=*/true); -} - -EXTERN -int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_generic( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) { - return nvptx_parallel_reduce_nowait( - global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct, - /*isSPMDExecutionMode=*/false, /*isRuntimeUninitialized=*/true); -} - -INLINE -static int32_t nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars, - size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, - kmp_InterWarpCopyFctPtr cpyFct, - kmp_CopyToScratchpadFctPtr scratchFct, - kmp_LoadReduceFctPtr ldFct, - bool isSPMDExecutionMode) { - uint32_t ThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode); - // In non-generic mode all workers participate in the teams reduction. - // In generic mode only the team master participates in the teams - // reduction because the workers are waiting for parallel work. - uint32_t NumThreads = - isSPMDExecutionMode ? GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true) - : /*Master thread only*/ 1; - uint32_t TeamId = GetBlockIdInKernel(); - uint32_t NumTeams = GetNumberOfBlocksInKernel(); - static SHARED volatile bool IsLastTeam; - - // Team masters of all teams write to the scratchpad. - if (ThreadId == 0) { - unsigned int *timestamp = GetTeamsReductionTimestamp(); - char *scratchpad = GetTeamsReductionScratchpad(); - - scratchFct(reduce_data, scratchpad, TeamId, NumTeams); - __kmpc_impl_threadfence(); - - // atomicInc increments 'timestamp' and has a range [0, NumTeams-1]. - // It resets 'timestamp' back to 0 once the last team increments - // this counter. - unsigned val = __kmpc_atomic_inc(timestamp, NumTeams - 1); - IsLastTeam = val == NumTeams - 1; - } - - // We have to wait on L1 barrier because in GENERIC mode the workers - // are waiting on barrier 0 for work. - // - // If we guard this barrier as follows it leads to deadlock, probably - // because of a compiler bug: if (!IsGenericMode()) __syncthreads(); - uint16_t SyncWarps = (NumThreads + WARPSIZE - 1) / WARPSIZE; - __kmpc_impl_named_sync(L1_BARRIER, SyncWarps * WARPSIZE); - - // If this team is not the last, quit. - if (/* Volatile read by all threads */ !IsLastTeam) - return 0; - - // - // Last team processing. 
- // - - // Threads in excess of #teams do not participate in reduction of the - // scratchpad values. -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 - uint32_t ActiveThreads = NumThreads; - if (NumTeams < NumThreads) { - ActiveThreads = - (NumTeams < WARPSIZE) ? 1 : NumTeams & ~((uint16_t)WARPSIZE - 1); - } - if (ThreadId >= ActiveThreads) - return 0; - - // Load from scratchpad and reduce. - char *scratchpad = GetTeamsReductionScratchpad(); - ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0); - for (uint32_t i = ActiveThreads + ThreadId; i < NumTeams; i += ActiveThreads) - ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1); - - uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE; - uint32_t WarpId = ThreadId / WARPSIZE; - - // Reduce across warps to the warp master. - if ((ActiveThreads % WARPSIZE == 0) || - (WarpId < WarpsNeeded - 1)) // Full warp - gpu_regular_warp_reduce(reduce_data, shflFct); - else if (ActiveThreads > 1) // Partial warp but contiguous lanes - // Only SPMD execution mode comes thru this case. - gpu_irregular_warp_reduce(reduce_data, shflFct, - /*LaneCount=*/ActiveThreads % WARPSIZE, - /*LaneId=*/ThreadId % WARPSIZE); - - // When we have more than [warpsize] number of threads - // a block reduction is performed here. - if (ActiveThreads > WARPSIZE) { - // Gather all the reduced values from each warp - // to the first warp. - cpyFct(reduce_data, WarpsNeeded); - - if (WarpId == 0) - gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId); - } -#else - if (ThreadId >= NumTeams) - return 0; - - // Load from scratchpad and reduce. - char *scratchpad = GetTeamsReductionScratchpad(); - ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0); - for (uint32_t i = NumThreads + ThreadId; i < NumTeams; i += NumThreads) - ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1); - - // Reduce across warps to the warp master. - __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask(); - if (Liveness == __kmpc_impl_all_lanes) // Full warp - gpu_regular_warp_reduce(reduce_data, shflFct); - else // Partial warp but contiguous lanes - gpu_irregular_warp_reduce(reduce_data, shflFct, - /*LaneCount=*/__kmpc_impl_popc(Liveness), - /*LaneId=*/ThreadId % WARPSIZE); - - // When we have more than [warpsize] number of threads - // a block reduction is performed here. - uint32_t ActiveThreads = NumTeams < NumThreads ? NumTeams : NumThreads; - if (ActiveThreads > WARPSIZE) { - uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE; - // Gather all the reduced values from each warp - // to the first warp. 
-    cpyFct(reduce_data, WarpsNeeded);
-
-    uint32_t WarpId = ThreadId / WARPSIZE;
-    if (WarpId == 0)
-      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);
-  }
-#endif // __CUDA_ARCH__ >= 700
-
-  return ThreadId == 0;
-}
-
-EXTERN
-int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars,
-                                         size_t reduce_size, void *reduce_data,
-                                         kmp_ShuffleReductFctPtr shflFct,
-                                         kmp_InterWarpCopyFctPtr cpyFct,
-                                         kmp_CopyToScratchpadFctPtr scratchFct,
-                                         kmp_LoadReduceFctPtr ldFct) {
-  return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
-                                   reduce_data, shflFct, cpyFct, scratchFct,
-                                   ldFct, isSPMDMode());
-}
-
-EXTERN
-int32_t __kmpc_nvptx_teams_reduce_nowait_simple_spmd(
-    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
-    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
-    kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) {
-  return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
-                                   reduce_data, shflFct, cpyFct, scratchFct,
-                                   ldFct, /*isSPMDExecutionMode=*/true);
-}
-
-EXTERN
-int32_t __kmpc_nvptx_teams_reduce_nowait_simple_generic(
-    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
-    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
-    kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) {
-  return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
-                                   reduce_data, shflFct, cpyFct, scratchFct,
-                                   ldFct, /*isSPMDExecutionMode=*/false);
-}
-
-EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple(kmp_Ident *loc,
-                                                       int32_t global_tid,
-                                                       kmp_CriticalName *crit) {
-  if (checkSPMDMode(loc) && GetThreadIdInBlock() != 0)
-    return 0;
-  // The master thread of the team actually does the reduction.
-  while (__kmpc_atomic_cas((uint32_t *)crit, 0u, 1u))
-    ;
-  return 1;
-}
-
-EXTERN void
-__kmpc_nvptx_teams_end_reduce_nowait_simple(kmp_Ident *loc, int32_t global_tid,
-                                            kmp_CriticalName *crit) {
-  __kmpc_impl_threadfence_system();
-  (void)__kmpc_atomic_exchange((uint32_t *)crit, 0u);
-}
-
 INLINE static bool isMaster(kmp_Ident *loc, uint32_t ThreadId) {
   return checkGenericMode(loc) || IsTeamMaster(ThreadId);
 }
diff --git a/openmp/libomptarget/deviceRTLs/common/src/support.cu b/openmp/libomptarget/deviceRTLs/common/src/support.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/support.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/support.cu
@@ -264,6 +264,3 @@
   return static_cast<char *>(ReductionScratchpadPtr) + 256;
 }
 
-DEVICE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr) {
-  ReductionScratchpadPtr = ScratchpadPtr;
-}
diff --git a/openmp/libomptarget/deviceRTLs/common/src/sync.cu b/openmp/libomptarget/deviceRTLs/common/src/sync.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/sync.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/sync.cu
@@ -79,23 +79,6 @@
   PRINT0(LD_SYNC, "completed kmpc_barrier_simple_spmd\n");
 }
-
-// Emit a simple barrier call in Generic mode. Assumes the caller is in an L0
-// parallel region and that all worker threads participate.
-EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid) {
-  int numberOfActiveOMPThreads = GetNumberOfThreadsInBlock() - WARPSIZE;
-  // The #threads parameter must be rounded up to the WARPSIZE.
- int threads = - WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE); - - PRINT(LD_SYNC, - "call kmpc_barrier_simple_generic with %d omp threads, sync parameter " - "%d\n", - (int)numberOfActiveOMPThreads, (int)threads); - // Barrier #1 is for synchronization among active threads. - __kmpc_impl_named_sync(L1_BARRIER, threads); - PRINT0(LD_SYNC, "completed kmpc_barrier_simple_generic\n"); -} - //////////////////////////////////////////////////////////////////////////////// // KMP MASTER //////////////////////////////////////////////////////////////////////////////// diff --git a/openmp/libomptarget/deviceRTLs/common/support.h b/openmp/libomptarget/deviceRTLs/common/support.h --- a/openmp/libomptarget/deviceRTLs/common/support.h +++ b/openmp/libomptarget/deviceRTLs/common/support.h @@ -94,6 +94,5 @@ //////////////////////////////////////////////////////////////////////////////// DEVICE unsigned int *GetTeamsReductionTimestamp(); DEVICE char *GetTeamsReductionScratchpad(); -DEVICE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr); #endif diff --git a/openmp/libomptarget/deviceRTLs/interface.h b/openmp/libomptarget/deviceRTLs/interface.h --- a/openmp/libomptarget/deviceRTLs/interface.h +++ b/openmp/libomptarget/deviceRTLs/interface.h @@ -193,17 +193,10 @@ // parallel defs typedef ident_t kmp_Ident; -typedef void (*kmp_ParFctPtr)(int32_t *global_tid, int32_t *bound_tid, ...); -typedef void (*kmp_ReductFctPtr)(void *lhsData, void *rhsData); typedef void (*kmp_InterWarpCopyFctPtr)(void *src, int32_t warp_num); typedef void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t lane_offset, int16_t shortCircuit); -typedef void (*kmp_CopyToScratchpadFctPtr)(void *reduceData, void *scratchpad, - int32_t index, int32_t width); -typedef void (*kmp_LoadReduceFctPtr)(void *reduceData, void *scratchpad, - int32_t index, int32_t width, - int32_t reduce); typedef void (*kmp_ListGlobalFctPtr)(void *buffer, int idx, void *reduce_data); // task defs @@ -227,12 +220,6 @@ EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc); EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t global_tid, int32_t num_threads); -// simd -EXTERN void __kmpc_push_simd_limit(kmp_Ident *loc, int32_t global_tid, - int32_t simd_limit); -// aee ... 
not supported -// EXTERN void __kmpc_fork_call(kmp_Ident *loc, int32_t argc, kmp_ParFctPtr -// microtask, ...); EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid); EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc, uint32_t global_tid); @@ -354,61 +341,25 @@ EXTERN void __kmpc_dispatch_fini_8(kmp_Ident *loc, int32_t global_tid); EXTERN void __kmpc_dispatch_fini_8u(kmp_Ident *loc, int32_t global_tid); -// Support for reducing conditional lastprivate variables -EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc, - int32_t global_tid, - int32_t varNum, void *array); - // reduction EXTERN void __kmpc_nvptx_end_reduce(int32_t global_tid); EXTERN void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid); -EXTERN __attribute__((deprecated)) int32_t __kmpc_nvptx_parallel_reduce_nowait( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct); EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_v2( kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct); -EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct); -EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_generic( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct); -EXTERN int32_t __kmpc_nvptx_simd_reduce_nowait( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct); EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2( kmp_Ident *loc, int32_t global_tid, void *global_buffer, int32_t num_of_records, void *reduce_data, kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, kmp_ListGlobalFctPtr lgcpyFct, kmp_ListGlobalFctPtr lgredFct, kmp_ListGlobalFctPtr glcpyFct, kmp_ListGlobalFctPtr glredFct); -EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, - kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct); -EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple_spmd( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, - kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct); -EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple_generic( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, - kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct); -EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple(kmp_Ident *loc, - int32_t global_tid, - kmp_CriticalName *crit); -EXTERN void __kmpc_nvptx_teams_end_reduce_nowait_simple(kmp_Ident *loc, - int32_t global_tid, - kmp_CriticalName *crit); EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size); EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size); // sync barrier EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid); EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid); -EXTERN void 
__kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid); EXTERN int32_t __kmpc_cancel_barrier(kmp_Ident *loc, int32_t global_tid); // single @@ -468,29 +419,16 @@ int32_t cancelVal); // non standard -EXTERN void __kmpc_kernel_init_params(void *ReductionScratchpadPtr); EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime); EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized); EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime, int16_t RequiresDataSharing); -EXTERN __attribute__((deprecated)) void __kmpc_spmd_kernel_deinit(); EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime); EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, int16_t IsOMPRuntimeInitialized); EXTERN bool __kmpc_kernel_parallel(void **WorkFn, int16_t IsOMPRuntimeInitialized); EXTERN void __kmpc_kernel_end_parallel(); -EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, - __kmpc_impl_lanemask_t Mask, - bool *IsFinal, - int32_t *LaneSource); -EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer); -EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, - __kmpc_impl_lanemask_t Mask, - bool *IsFinal, int32_t *LaneSource, - int32_t *LaneId, int32_t *NumLanes); -EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer); - EXTERN void __kmpc_data_sharing_init_stack(); EXTERN void __kmpc_data_sharing_init_stack_spmd(); @@ -512,22 +450,6 @@ void *DataEnd; char Data[]; }; -EXTERN void -__kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *RootS, - size_t InitialDataSize); -EXTERN void *__kmpc_data_sharing_environment_begin( - __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack, - void **SavedSharedFrame, __kmpc_impl_lanemask_t *SavedActiveThreads, - size_t SharingDataSize, size_t SharingDefaultDataSize, - int16_t IsOMPRuntimeInitialized); -EXTERN void __kmpc_data_sharing_environment_end( - __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack, - void **SavedSharedFrame, __kmpc_impl_lanemask_t *SavedActiveThreads, - int32_t IsEntryPoint); - -EXTERN void * -__kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID, - int16_t IsOMPRuntimeInitialized); // SPMD execution mode interrogation function. EXTERN int8_t __kmpc_is_spmd_exec_mode(); diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -195,15 +195,6 @@ INLINE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; } INLINE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); } -// Return true if this is the first active thread in the warp. -INLINE bool __kmpc_impl_is_first_active_thread() { - unsigned long long Mask = __kmpc_impl_activemask(); - unsigned long long ShNum = WARPSIZE - (GetThreadIdInBlock() % WARPSIZE); - unsigned long long Sh = Mask << ShNum; - // Truncate Sh to the 32 lower bits - return (unsigned)Sh == 0; -} - // Locks EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock); EXTERN void __kmpc_impl_destroy_lock(omp_lock_t *lock);
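
Note on the lane-mask trick in the deleted __kmpc_impl_is_first_active_thread(): shifting the 32-bit active-lane mask left by WARPSIZE - lane_id within a 64-bit value pushes the bits for the caller's lane and all higher lanes past bit 31, so the truncated low word holds exactly the lanes below the caller; a zero low word means no lower-numbered lane is active. Below is a minimal host-side C++ sketch of that logic; the helper name isFirstActiveLane is hypothetical and stands in for the device-side mask and lane-id intrinsics, so treat it as an illustration rather than part of the runtime.

#include <cassert>
#include <cstdint>

constexpr unsigned Warpsize = 32;

// Hypothetical host-side model of the deleted check: a lane is the warp's
// first active thread iff no lower-numbered lane is set in the active mask.
static bool isFirstActiveLane(uint32_t ActiveMask, unsigned LaneId) {
  // Bits for lanes [0, LaneId) land in the low 32 bits; bits for lanes
  // [LaneId, 32) move above bit 31 and are discarded by the truncation.
  uint64_t Sh = static_cast<uint64_t>(ActiveMask) << (Warpsize - LaneId);
  return static_cast<uint32_t>(Sh) == 0;
}

int main() {
  uint32_t Mask = 0xC;                 // lanes 2 and 3 active
  assert(isFirstActiveLane(Mask, 2));  // nothing active below lane 2
  assert(!isFirstActiveLane(Mask, 3)); // lane 2 is active below lane 3
  return 0;
}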