Index: libomptarget/deviceRTLs/nvptx/src/interface.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/interface.h +++ libomptarget/deviceRTLs/nvptx/src/interface.h @@ -470,8 +470,9 @@ int16_t RequiresDataSharing); EXTERN void __kmpc_spmd_kernel_deinit(); EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, + void ***SharedArgs, int32_t nArgs, int16_t IsOMPRuntimeInitialized); -EXTERN bool __kmpc_kernel_parallel(void **WorkFn, +EXTERN bool __kmpc_kernel_parallel(void **WorkFn, void ***SharedArgs, int16_t IsOMPRuntimeInitialized); EXTERN void __kmpc_kernel_end_parallel(); EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, uint32_t Mask, Index: libomptarget/deviceRTLs/nvptx/src/omp_data.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/omp_data.cu +++ libomptarget/deviceRTLs/nvptx/src/omp_data.cu @@ -46,3 +46,8 @@ // Scratchpad for teams reduction. //////////////////////////////////////////////////////////////////////////////// __device__ __shared__ void *ReductionScratchpadPtr; + +//////////////////////////////////////////////////////////////////////////////// +// Data sharing related variables. +//////////////////////////////////////////////////////////////////////////////// +__device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_sharedArgs; Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -62,6 +62,46 @@ #define __ACTIVEMASK() __ballot(1) #endif +// arguments needed for L0 parallelism only. +class omptarget_nvptx_SharedArgs { +public: + // All these methods must be called by the master thread only. + INLINE void Init() { + args = buffer; + nArgs = MAX_SHARED_ARGS; + } + INLINE void DeInit() { + // Free any memory allocated for outlined parallel function with a large + // number of arguments. + if (nArgs > MAX_SHARED_ARGS) { + SafeFree(args, (char *)"new extended args"); + Init(); + } + } + INLINE void EnsureSize(int size) { + if (size > nArgs) { + if (nArgs > MAX_SHARED_ARGS) { + SafeFree(args, (char *)"new extended args"); + } + args = (void **) SafeMalloc(size * sizeof(void *), + (char *)"new extended args"); + nArgs = size; + } + } + // Called by all threads. + INLINE void **GetArgs() { return args; }; +private: + // buffer of pre-allocated arguments. + void *buffer[MAX_SHARED_ARGS]; + // pointer to arguments buffer. + // starts off as a pointer to 'buffer' but can be dynamically allocated. + void **args; + // starts off as MAX_SHARED_ARGS but can increase in size. + uint32_t nArgs; +}; + +extern __device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_sharedArgs; + // Data sharing related quantities, need to match what is used in the compiler. enum DATA_SHARING_SIZES { // The maximum number of workers in a kernel. Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu @@ -54,6 +54,9 @@ PRINT(LD_IO, "call to __kmpc_kernel_init with version %f\n", OMPTARGET_NVPTX_VERSION); + // init parallel work arguments + omptarget_nvptx_sharedArgs.Init(); + if (!RequiresOMPRuntime) { // If OMP runtime is not required don't initialize OMP state. setExecutionParameters(Generic, RuntimeUninitialized); @@ -107,6 +110,9 @@ } // Done with work. Kill the workers. omptarget_nvptx_workFn = 0; + + // Deinit parallel work arguments + omptarget_nvptx_sharedArgs.DeInit(); } EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime, Index: libomptarget/deviceRTLs/nvptx/src/option.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/option.h +++ libomptarget/deviceRTLs/nvptx/src/option.h @@ -28,6 +28,10 @@ // region to synchronize with each other. #define L1_BARRIER (1) +// Maximum number of preallocated arguments to an outlined parallel/simd function. +// Anything more requires dynamic memory allocation. +#define MAX_SHARED_ARGS 20 + // Maximum number of omp state objects per SM allocated statically in global // memory. #if __CUDA_ARCH__ >= 600 Index: libomptarget/deviceRTLs/nvptx/src/parallel.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/parallel.cu +++ libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -214,10 +214,16 @@ // // This routine is always called by the team master.. EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, + void ***SharedArgs, int32_t nArgs, int16_t IsOMPRuntimeInitialized) { PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n"); omptarget_nvptx_workFn = WorkFn; + if (nArgs > 0) { + omptarget_nvptx_sharedArgs.EnsureSize(nArgs); + *SharedArgs = omptarget_nvptx_sharedArgs.GetArgs(); + } + if (!IsOMPRuntimeInitialized) return; @@ -317,11 +323,13 @@ // // Only the worker threads call this routine. EXTERN bool __kmpc_kernel_parallel(void **WorkFn, + void ***SharedArgs, int16_t IsOMPRuntimeInitialized) { PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n"); // Work function and arguments for L1 parallel region. *WorkFn = omptarget_nvptx_workFn; + *SharedArgs = omptarget_nvptx_sharedArgs.GetArgs(); if (!IsOMPRuntimeInitialized) return true;