diff --git a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
@@ -21,114 +21,70 @@
 static constexpr unsigned MinBytes = 8;
-template <unsigned BytesPerThread, unsigned NThreads>
+template <unsigned BPerThread, unsigned NThreads>
 struct alignas(32) ThreadStackTy {
-  static constexpr unsigned MaxSize = NThreads * BytesPerThread;
+  static constexpr unsigned BytesPerThread = BPerThread;
   static constexpr unsigned NumThreads = NThreads;
   static constexpr unsigned NumWarps = (NThreads + WARPSIZE - 1) / WARPSIZE;
-  static constexpr unsigned MaxSizePerWarp = MaxSize / NumWarps;
-  unsigned char Data[MaxSize];
-  char Sizes[MaxSize / MinBytes];
-  char SizeUsage[NumWarps];
-  char Usage[NumWarps];
+  unsigned char Data[NumThreads][BytesPerThread];
+  unsigned char Usage[NumThreads];
 };
 [[clang::loader_uninitialized]] ThreadStackTy MainSharedStack;
 #pragma omp allocate(MainSharedStack) allocator(omp_pteam_mem_alloc)
-[[clang::loader_uninitialized]] ThreadStackTy
+[[clang::loader_uninitialized]] ThreadStackTy
     WorkerSharedStack;
 #pragma omp allocate(WorkerSharedStack) allocator(omp_pteam_mem_alloc)
-template <typename AllocTy>
-static void *__kmpc_alloc_for_warp(AllocTy Alloc, unsigned Bytes,
-                                   unsigned WarpBytes) {
-  void *Ptr;
-  __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask();
-  unsigned LeaderID = __kmpc_impl_ffs(CurActive) - 1;
-  bool IsWarpLeader =
-      (__kmpc_get_hardware_thread_id_in_block() % WARPSIZE) == LeaderID;
-  if (IsWarpLeader)
-    Ptr = Alloc();
-  // Get address from the first active lane.
-  int *FP = (int *)&Ptr;
-  FP[0] = __kmpc_impl_shfl_sync(CurActive, FP[0], LeaderID);
-  if (sizeof(Ptr) == 8)
-    FP[1] = __kmpc_impl_shfl_sync(CurActive, FP[1], LeaderID);
-  return (void *)&((char *)(Ptr))[(GetLaneId() - LeaderID) * Bytes];
-}
 EXTERN void *__kmpc_alloc_shared(size_t Bytes) {
-  Bytes = Bytes + (Bytes % MinBytes);
+  size_t AlignedBytes = Bytes + (Bytes % MinBytes);
   int TID = __kmpc_get_hardware_thread_id_in_block();
   if (__kmpc_is_generic_main_thread(TID)) {
     // Main thread alone, use shared memory if space is available.
-    if (MainSharedStack.Usage[0] + Bytes <= MainSharedStack.MaxSize) {
-      void *Ptr = &MainSharedStack.Data[MainSharedStack.Usage[0]];
-      MainSharedStack.Usage[0] += Bytes;
-      MainSharedStack.Sizes[MainSharedStack.SizeUsage[0]++] = Bytes;
+    if (MainSharedStack.Usage[0] + AlignedBytes <=
+        MainSharedStack.BytesPerThread) {
+      void *Ptr = &MainSharedStack.Data[0][MainSharedStack.Usage[0]];
+      MainSharedStack.Usage[0] += AlignedBytes;
       return Ptr;
     }
-  } else {
-    int WID = GetWarpId();
-    unsigned WarpBytes = Bytes * WARPSIZE;
-    auto AllocSharedStack = [&]() {
-      unsigned WarpOffset = WID * WorkerSharedStack.MaxSizePerWarp;
-      void *Ptr =
-          &WorkerSharedStack.Data[WarpOffset + WorkerSharedStack.Usage[WID]];
-      WorkerSharedStack.Usage[WID] += WarpBytes;
-      WorkerSharedStack.Sizes[WorkerSharedStack.SizeUsage[WID]++] = WarpBytes;
+  } else if (TID < WorkerSharedStack.NumThreads) {
+    if (WorkerSharedStack.Usage[TID] + AlignedBytes <=
+        WorkerSharedStack.BytesPerThread) {
+      void *Ptr = &WorkerSharedStack.Data[TID][WorkerSharedStack.Usage[TID]];
+      WorkerSharedStack.Usage[TID] += AlignedBytes;
       return Ptr;
-    };
-    if (TID < WorkerSharedStack.NumThreads &&
-        WorkerSharedStack.Usage[WID] + WarpBytes <=
-            WorkerSharedStack.MaxSizePerWarp)
-      return __kmpc_alloc_for_warp(AllocSharedStack, Bytes, WarpBytes);
+    }
   }
   // Fallback to malloc
-  unsigned WarpBytes = Bytes * WARPSIZE;
-  auto AllocGlobal = [&] {
-    return SafeMalloc(WarpBytes, "AllocGlobalFallback");
-  };
-  return __kmpc_alloc_for_warp(AllocGlobal, Bytes, WarpBytes);
+  return SafeMalloc(Bytes, "AllocGlobalFallback");
 }
-EXTERN void __kmpc_free_shared(void *Ptr, size_t /* Bytes */) {
-  __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask();
-  unsigned LeaderID = __kmpc_impl_ffs(CurActive) - 1;
-  bool IsWarpLeader =
-      (__kmpc_get_hardware_thread_id_in_block() % WARPSIZE) == LeaderID;
-  __kmpc_syncwarp(CurActive);
-  if (IsWarpLeader) {
-    if (Ptr >= &MainSharedStack.Data[0] &&
-        Ptr < &MainSharedStack.Data[MainSharedStack.MaxSize]) {
-      unsigned Bytes = MainSharedStack.Sizes[--MainSharedStack.SizeUsage[0]];
-      MainSharedStack.Usage[0] -= Bytes;
+EXTERN void __kmpc_free_shared(void *Ptr, size_t Bytes) {
+  size_t AlignedBytes = Bytes + (Bytes % MinBytes);
+  int TID = __kmpc_get_hardware_thread_id_in_block();
+  if (__kmpc_is_generic_main_thread(TID)) {
+    if (Ptr >= &MainSharedStack.Data[0][0] &&
+        Ptr < &MainSharedStack.Data[MainSharedStack.NumThreads][0]) {
+      MainSharedStack.Usage[0] -= AlignedBytes;
       return;
     }
-    if (Ptr >= &WorkerSharedStack.Data[0] &&
-        Ptr < &WorkerSharedStack.Data[WorkerSharedStack.MaxSize]) {
-      int WID = GetWarpId();
-      unsigned Bytes =
-          WorkerSharedStack.Sizes[--WorkerSharedStack.SizeUsage[WID]];
-      WorkerSharedStack.Usage[WID] -= Bytes;
+  } else if (TID < WorkerSharedStack.NumThreads) {
+    if (Ptr >= &WorkerSharedStack.Data[0][0] &&
+        Ptr < &WorkerSharedStack.Data[WorkerSharedStack.NumThreads][0]) {
+      WorkerSharedStack.Usage[TID] -= AlignedBytes;
      return;
     }
-    SafeFree(Ptr, "FreeGlobalFallback");
   }
+  SafeFree(Ptr, "FreeGlobalFallback");
 }
 EXTERN void __kmpc_data_sharing_init_stack() {
-  for (unsigned i = 0; i < MainSharedStack.NumWarps; ++i) {
-    MainSharedStack.SizeUsage[i] = 0;
+  for (unsigned i = 0; i < MainSharedStack.NumWarps; ++i)
     MainSharedStack.Usage[i] = 0;
-  }
-  for (unsigned i = 0; i < WorkerSharedStack.NumWarps; ++i) {
-    WorkerSharedStack.SizeUsage[i] = 0;
+  for (unsigned i = 0; i < WorkerSharedStack.NumThreads; ++i)
     WorkerSharedStack.Usage[i] = 0;
-  }
 }
 /// Allocate storage in shared memory to communicate arguments from the main
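
The replacement scheme above is a per-thread bump allocator: each thread owns a fixed BytesPerThread slice of the team-shared stack, __kmpc_alloc_shared advances that thread's Usage counter by the MinBytes-padded size, and __kmpc_free_shared walks the counter back, effectively requiring LIFO pairing per thread; only when a slice is exhausted does the runtime fall back to SafeMalloc. The host-side sketch below mirrors just that bookkeeping and is not the device runtime itself: SketchStackTy, sketch_alloc_shared, sketch_free_shared, BYTES_PER_THREAD, and NUM_THREADS are illustrative stand-ins for ThreadStackTy, the two entry points, and the template parameters.

// Host-side sketch of the per-thread bump allocation used above.
// Constants are illustrative only, not taken from the runtime.
#include <cassert>
#include <cstddef>
#include <cstdio>

static constexpr unsigned MinBytes = 8;
static constexpr unsigned BYTES_PER_THREAD = 64; // stand-in for BPerThread
static constexpr unsigned NUM_THREADS = 4;       // stand-in for NThreads

struct alignas(32) SketchStackTy {
  unsigned char Data[NUM_THREADS][BYTES_PER_THREAD];
  unsigned char Usage[NUM_THREADS];
};

static SketchStackTy Stack = {}; // zeroed Usage, like the init_stack entry point

// Bump-allocate from the calling thread's private slice; a null return stands
// in for the SafeMalloc fallback taken when the slice is full.
static void *sketch_alloc_shared(int TID, size_t Bytes) {
  size_t AlignedBytes = Bytes + (Bytes % MinBytes); // same padding rule as the patch
  if (Stack.Usage[TID] + AlignedBytes > BYTES_PER_THREAD)
    return nullptr;
  void *Ptr = &Stack.Data[TID][Stack.Usage[TID]];
  Stack.Usage[TID] += AlignedBytes;
  return Ptr;
}

// Frees must mirror allocations in LIFO order per thread; only the usage
// counter moves, since the patch drops the per-allocation size records.
static void sketch_free_shared(int TID, void * /*Ptr*/, size_t Bytes) {
  size_t AlignedBytes = Bytes + (Bytes % MinBytes);
  Stack.Usage[TID] -= AlignedBytes;
}

int main() {
  void *A = sketch_alloc_shared(/*TID=*/1, 20); // padded to 24 bytes
  void *B = sketch_alloc_shared(/*TID=*/1, 8);  // stacked directly on top of A
  assert(B == static_cast<unsigned char *>(A) + 24);
  sketch_free_shared(1, B, 8);
  sketch_free_shared(1, A, 20);
  assert(Stack.Usage[1] == 0); // slice fully released
  std::printf("thread 1 usage after frees: %u\n", (unsigned)Stack.Usage[1]);
}

Note that the padding rule Bytes + (Bytes % MinBytes) does not round every size up to a multiple of MinBytes (24 stays 24, 20 becomes 24, but 5 becomes 10); because the allocation and the free apply the identical formula, the Usage counter still balances.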