diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h @@ -40,10 +40,6 @@ #define WARPSIZE 64 -// Maximum number of preallocated arguments to an outlined parallel/simd -// function. Anything more requires dynamic memory allocation. -#define MAX_SHARED_ARGS 20 - // Maximum number of omp state objects per SM allocated statically in global // memory. #define OMP_STATE_COUNT 32 diff --git a/openmp/libomptarget/deviceRTLs/common/omptarget.h b/openmp/libomptarget/deviceRTLs/common/omptarget.h --- a/openmp/libomptarget/deviceRTLs/common/omptarget.h +++ b/openmp/libomptarget/deviceRTLs/common/omptarget.h @@ -35,46 +35,6 @@ #define BARRIER_COUNTER 0 #define ORDERED_COUNTER 1 -// arguments needed for L0 parallelism only. -class omptarget_nvptx_SharedArgs { -public: - // All these methods must be called by the master thread only. - INLINE void Init() { - args = buffer; - nArgs = MAX_SHARED_ARGS; - } - INLINE void DeInit() { - // Free any memory allocated for outlined parallel function with a large - // number of arguments. - if (nArgs > MAX_SHARED_ARGS) { - SafeFree(args, "new extended args"); - Init(); - } - } - INLINE void EnsureSize(size_t size) { - if (size > nArgs) { - if (nArgs > MAX_SHARED_ARGS) { - SafeFree(args, "new extended args"); - } - args = (void **)SafeMalloc(size * sizeof(void *), "new extended args"); - nArgs = size; - } - } - // Called by all threads. - INLINE void **GetArgs() const { return args; }; - -private: - // buffer of pre-allocated arguments. - void *buffer[MAX_SHARED_ARGS]; - // pointer to arguments buffer. - // starts off as a pointer to 'buffer' but can be dynamically allocated. - void **args; - // starts off as MAX_SHARED_ARGS but can increase in size. - uint32_t nArgs; -}; - -extern omptarget_nvptx_SharedArgs EXTERN_SHARED(omptarget_nvptx_globalArgs); - // Worker slot type which is initialized with the default worker slot // size of 4*32 bytes. struct __kmpc_data_sharing_slot { diff --git a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu --- a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu @@ -135,14 +135,32 @@ } } +/// Allocate storage in shared memory to communicate arguments from the main +/// thread to the workers in generic mode. If we exceed +/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication. +#define NUM_SHARED_VARIABLES_IN_SHARED_MEM 64 + +[[clang::loader_uninitialized]] static void + *SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM]; +#pragma omp allocate(SharedMemVariableSharingSpace) \ + allocator(omp_pteam_mem_alloc) +[[clang::loader_uninitialized]] static void **SharedMemVariableSharingSpacePtr; +#pragma omp allocate(SharedMemVariableSharingSpacePtr) \ + allocator(omp_pteam_mem_alloc) + // Begin a data sharing context. Maintain a list of references to shared // variables. This list of references to shared variables will be passed // to one or more threads. // In L0 data sharing this is called by master thread. // In L1 data sharing this is called by active warp master thread. EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs) { - omptarget_nvptx_globalArgs.EnsureSize(nArgs); - *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs(); + if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) { + SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0]; + } else { + SharedMemVariableSharingSpacePtr = + (void **)SafeMalloc(nArgs * sizeof(void *), "new extended args"); + } + *GlobalArgs = SharedMemVariableSharingSpacePtr; } // End a data sharing context. There is no need to have a list of refs @@ -152,7 +170,8 @@ // In L0 data sharing this is called by master thread. // In L1 data sharing this is called by active warp master thread. EXTERN void __kmpc_end_sharing_variables() { - omptarget_nvptx_globalArgs.DeInit(); + if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0]) + SafeFree(SharedMemVariableSharingSpacePtr, "new extended args"); } // This function will return a list of references to global variables. This @@ -161,7 +180,7 @@ // preserving the order. // Called by all workers. EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) { - *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs(); + *GlobalArgs = SharedMemVariableSharingSpacePtr; } // This function is used to init static memory manager. This manager is used to diff --git a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu b/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu --- a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu @@ -62,9 +62,4 @@ //////////////////////////////////////////////////////////////////////////////// void *SHARED(ReductionScratchpadPtr); -//////////////////////////////////////////////////////////////////////////////// -// Data sharing related variables. -//////////////////////////////////////////////////////////////////////////////// -omptarget_nvptx_SharedArgs SHARED(omptarget_nvptx_globalArgs); - #pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu --- a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu @@ -68,8 +68,6 @@ nThreads = GetNumberOfWorkersInTeam(); threadLimit = nThreads; - omptarget_nvptx_globalArgs.Init(); - __kmpc_data_sharing_init_stack(); __kmpc_impl_target_init(); } diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -33,10 +33,6 @@ #define WARPSIZE 32 -// Maximum number of preallocated arguments to an outlined parallel/simd -// function. Anything more requires dynamic memory allocation. -#define MAX_SHARED_ARGS 20 - // Maximum number of omp state objects per SM allocated statically in global // memory. #if __CUDA_ARCH__ >= 600