Index: libomptarget/deviceRTLs/nvptx/src/interface.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/interface.h +++ libomptarget/deviceRTLs/nvptx/src/interface.h @@ -1287,8 +1287,8 @@ int16_t RequiresOMPRuntime, int16_t RequiresDataSharing); EXTERN void __kmpc_spmd_kernel_deinit(); -EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, int16_t IsOMPRuntimeInitialized); -EXTERN bool __kmpc_kernel_parallel(void **WorkFn, int16_t IsOMPRuntimeInitialized); +EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, int16_t IsOMPRuntimeInitialized, void ***SharedArgs, int32_t nArgs); +EXTERN bool __kmpc_kernel_parallel(void **WorkFn, int16_t IsOMPRuntimeInitialized, void ***SharedArgs); EXTERN void __kmpc_kernel_end_parallel(); EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, uint32_t Mask, bool *IsFinal, int32_t *LaneSource); EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer); Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -60,6 +60,46 @@ #define __ACTIVEMASK() __ballot(1) #endif +// arguments needed for L0 parallelism only. +class omptarget_nvptx_SharedArgs { +public: + // All these methods must be called by the master thread only. + INLINE void Init() { + args = buffer; + nArgs = MAX_SHARED_ARGS; + } + INLINE void DeInit() { + // Free any memory allocated for outlined parallel function with a large + // number of arguments. + if (nArgs > MAX_SHARED_ARGS) { + SafeFree(args, (char *)"new extended args"); + Init(); + } + } + INLINE void EnsureSize(int size) { + if (size > nArgs) { + if (nArgs > MAX_SHARED_ARGS) { + SafeFree(args, (char *)"new extended args"); + } + args = (void **) SafeMalloc(size * sizeof(void *), + (char *)"new extended args"); + nArgs = size; + } + } + // Called by all threads. + INLINE void **GetArgs() { return args; }; +private: + // buffer of pre-allocated arguments. + void *buffer[MAX_SHARED_ARGS]; + // pointer to arguments buffer. + // starts off as a pointer to 'buffer' but can be dynamically allocated. + void **args; + // starts off as MAX_SHARED_ARGS but can increase in size. + uint32_t nArgs; +}; + +extern __device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_sharedArgs; + // Data sharing related quantities, need to match what is used in the compiler. enum DATA_SHARING_SIZES { // The maximum number of workers in a kernel. Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu @@ -32,6 +32,7 @@ // extern volatile __device__ __shared__ omptarget_nvptx_WorkFn omptarget_nvptx_workFn; extern __device__ __shared__ uint32_t execution_param; +__device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_sharedArgs; //////////////////////////////////////////////////////////////////////////////// // init entry points @@ -61,6 +62,9 @@ PRINT(LD_IO, "call to __kmpc_kernel_init with version %f\n", OMPTARGET_NVPTX_VERSION); + // init parallel work arguments + omptarget_nvptx_sharedArgs.Init(); + if (!RequiresOMPRuntime) { // If OMP runtime is not required don't initialize OMP state. setExecutionParameters(Generic, RuntimeUninitialized); @@ -114,6 +118,9 @@ } // Done with work. Kill the workers. omptarget_nvptx_workFn = 0; + + // Deinit parallel work arguments + omptarget_nvptx_sharedArgs.DeInit(); } EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, Index: libomptarget/deviceRTLs/nvptx/src/option.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/option.h +++ libomptarget/deviceRTLs/nvptx/src/option.h @@ -46,6 +46,10 @@ // to synchronize with each other. #define L1_BARRIER (1) +// Maximum number of preallocated arguments to an outlined parallel/simd function. +// Anything more requires dynamic memory allocation. +#define MAX_SHARED_ARGS 20 + // Maximum number of omp state objects per SM allocated statically in global memory. #if __CUDA_ARCH__ >= 600 #define OMP_STATE_COUNT 32 Index: libomptarget/deviceRTLs/nvptx/src/parallel.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/parallel.cu +++ libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -210,10 +210,16 @@ // } // // This routine is always called by the team master.. -EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, int16_t IsOMPRuntimeInitialized) { +EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, int16_t IsOMPRuntimeInitialized, + void ***SharedArgs, int32_t nArgs) { PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n"); omptarget_nvptx_workFn = WorkFn; + if (nArgs > 0) { + omptarget_nvptx_sharedArgs.EnsureSize(nArgs); + *SharedArgs = omptarget_nvptx_sharedArgs.GetArgs(); + } + if (!IsOMPRuntimeInitialized) return; // This routine is only called by the team master. The team master is @@ -310,11 +316,13 @@ // returns True if this thread is active, else False. // // Only the worker threads call this routine. -EXTERN bool __kmpc_kernel_parallel(void **WorkFn, int16_t IsOMPRuntimeInitialized) { +EXTERN bool __kmpc_kernel_parallel(void **WorkFn, int16_t IsOMPRuntimeInitialized, + void ***SharedArgs) { PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n"); // Work function and arguments for L1 parallel region. *WorkFn = omptarget_nvptx_workFn; + *SharedArgs = omptarget_nvptx_sharedArgs.GetArgs(); if (!IsOMPRuntimeInitialized) return true;