Index: libomptarget/deviceRTLs/nvptx/src/data_sharing.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/data_sharing.cu +++ libomptarget/deviceRTLs/nvptx/src/data_sharing.cu @@ -322,3 +322,162 @@ DSPRINT0(DSFLAG, "Exiting __kmpc_get_data_sharing_environment_frame\n"); return P; } + +//////////////////////////////////////////////////////////////////////////////// +// Runtime functions for trunk data sharing scheme. +//////////////////////////////////////////////////////////////////////////////// + +// Initialize data sharing data structure. This function needs to be called +// once at the beginning of a data sharing context (coincides with the kernel +// initialization). +EXTERN void __kmpc_data_sharing_init_stack() { + // This function initializes the stack pointer with the pointer to the + // statically allocated shared memory slots. The size of a shared memory + // slot is pre-determined to be 256 bytes. + unsigned WID = getWarpId(); + omptarget_nvptx_TeamDescr *teamDescr = + &omptarget_nvptx_threadPrivateContext->TeamContext(); + __kmpc_data_sharing_slot *RootS = teamDescr->RootS(WID); + + DataSharingState.SlotPtr[WID] = RootS; + DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0]; + + // We initialize the list of references to arguments here. + omptarget_nvptx_globalArgs.Init(); +} + +// Called at the time of the kernel initialization. This is used to initialize +// the list of references to shared variables and to pre-allocate global storage +// for holding the globalized variables. +// +// By default the globalized variables are stored in global memory. If the +// UseSharedMemory is set to true, the runtime will attempt to use shared memory +// as long as the size requested fits the pre-allocated size. +// +// TODO: allow more than one push per slot to save on calls to malloc. 
+// Currently there is only one slot for each push so the data size in the slot +// is the same size as the size being requested. +// +// Called by: master, TODO: call by workers +EXTERN void* __kmpc_data_sharing_push_stack(size_t size, + int16_t UseSharedMemory) { + // TODO: Add shared memory support. For now, use global memory only for + // storing the data sharing slots so ignore the pre-allocated + // shared memory slot. + + // Use global memory for storing the stack. + if (IsMasterThread()) { + unsigned WID = getWarpId(); + + // SlotP will point to either the shared memory slot or an existing + // global memory slot. + __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; + __kmpc_data_sharing_slot *&TailSlotP = DataSharingState.TailPtr[WID]; + + // The slot for holding the data we are pushing. + __kmpc_data_sharing_slot *NewSlot = 0; + size_t NewSize = size; + + // Check if there is a next slot. + if (__kmpc_data_sharing_slot *ExistingSlot = SlotP->Next) { + // Attempt to re-use an existing slot provided the data fits in the slot. + // The leftover data space will not be used. + ptrdiff_t ExistingSlotSize = (uintptr_t)ExistingSlot->DataEnd - + (uintptr_t)(&ExistingSlot->Data[0]); + if (ExistingSlotSize >= NewSize) + NewSlot = ExistingSlot; + else + free(ExistingSlot); + } + + if (!NewSlot) { + NewSlot = (__kmpc_data_sharing_slot *)malloc( + sizeof(__kmpc_data_sharing_slot) + NewSize); + NewSlot->Next = 0; + NewSlot->Prev = SlotP; + + // This is the last slot, save it. + TailSlotP = NewSlot; + } + + NewSlot->DataEnd = &NewSlot->Data[NewSize]; + + SlotP->Next = NewSlot; + SlotP = NewSlot; + + return (void*)&SlotP->Data[0]; + } + + // TODO: add memory fence here when this function can be called by + // worker threads also. For now, this function is only called by the + // master thread of each team. + + // TODO: implement sharing across workers. + return 0; +} + +// Pop the stack and free any memory which can be reclaimed. 
+// +// When the pop operation removes the last global memory slot, +// reclaim all outstanding global memory slots since it is +// likely we have reached the end of the kernel. +EXTERN void __kmpc_data_sharing_pop_stack(void *a) { + if (IsMasterThread()) { + unsigned WID = getWarpId(); + + __kmpc_data_sharing_slot *S = DataSharingState.SlotPtr[WID]; + + if (S->Prev) + S = S->Prev; + + // If this will "pop" the last global memory node then it is likely + // that we are at the end of the data sharing region and we can + // de-allocate any existing global memory slots. + if (!S->Prev) { + __kmpc_data_sharing_slot *Tail = DataSharingState.TailPtr[WID]; + + while(Tail && Tail->Prev) { + Tail = Tail->Prev; + free(Tail->Next); + Tail->Next=0; + } + } + + return; + } + + // TODO: add memory fence here when this function can be called by + // worker threads also. For now, this function is only called by the + // master thread of each team. + + // TODO: implement sharing across workers. +} + +// Begin a data sharing context. Maintain a list of references to shared +// variables. This list of references to shared variables will be passed +// to one or more threads. +// In L0 data sharing this is called by master thread. +// In L1 data sharing this is called by active warp master thread. +EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs) { + omptarget_nvptx_globalArgs.EnsureSize(nArgs); + *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs(); +} + +// End a data sharing context. There is no need to have a list of refs +// to shared variables because the context in which those variables were +// shared has now ended. This should clean-up the list of references only +// without affecting the actual global storage of the variables. +// In L0 data sharing this is called by master thread. +// In L1 data sharing this is called by active warp master thread. 
+EXTERN void __kmpc_end_sharing_variables() { + omptarget_nvptx_globalArgs.DeInit(); +} + +// This function will return a list of references to global variables. This +// is how the workers will get a reference to the globalized variable. The +// members of this list will be passed to the outlined parallel function +// preserving the order. +// Called by all workers. +EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) { + *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs(); +} Index: libomptarget/deviceRTLs/nvptx/src/interface.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/interface.h +++ libomptarget/deviceRTLs/nvptx/src/interface.h @@ -483,11 +483,20 @@ int32_t *LaneId, int32_t *NumLanes); EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer); + +EXTERN void __kmpc_data_sharing_init_stack(); +EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t UseSharedMemory); +EXTERN void __kmpc_data_sharing_pop_stack(void *a); +EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs); +EXTERN void __kmpc_end_sharing_variables(); +EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs); + // The slot used for data sharing by the master and worker threads. We use a // complete (default size version and an incomplete one so that we allow sizes // greater than the default). struct __kmpc_data_sharing_slot { __kmpc_data_sharing_slot *Next; + __kmpc_data_sharing_slot *Prev; void *DataEnd; char Data[]; }; Index: libomptarget/deviceRTLs/nvptx/src/omp_data.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/omp_data.cu +++ libomptarget/deviceRTLs/nvptx/src/omp_data.cu @@ -46,3 +46,8 @@ // Scratchpad for teams reduction. 
//////////////////////////////////////////////////////////////////////////////// __device__ __shared__ void *ReductionScratchpadPtr; + +//////////////////////////////////////////////////////////////////////////////// +// Data sharing related variables. +//////////////////////////////////////////////////////////////////////////////// +__device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs; Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -62,6 +62,46 @@ #define __ACTIVEMASK() __ballot(1) #endif +// arguments needed for L0 parallelism only. +class omptarget_nvptx_SharedArgs { +public: + // All these methods must be called by the master thread only. + INLINE void Init() { + args = buffer; + nArgs = MAX_SHARED_ARGS; + } + INLINE void DeInit() { + // Free any memory allocated for outlined parallel function with a large + // number of arguments. + if (nArgs > MAX_SHARED_ARGS) { + SafeFree(args, (char *)"new extended args"); + Init(); + } + } + INLINE void EnsureSize(size_t size) { + if (size > nArgs) { + if (nArgs > MAX_SHARED_ARGS) { + SafeFree(args, (char *)"new extended args"); + } + args = (void **) SafeMalloc(size * sizeof(void *), + (char *)"new extended args"); + nArgs = size; + } + } + // Called by all threads. + INLINE void **GetArgs() { return args; }; +private: + // buffer of pre-allocated arguments. + void *buffer[MAX_SHARED_ARGS]; + // pointer to arguments buffer. + // starts off as a pointer to 'buffer' but can be dynamically allocated. + void **args; + // starts off as MAX_SHARED_ARGS but can increase in size. + uint32_t nArgs; +}; + +extern __device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs; + // Data sharing related quantities, need to match what is used in the compiler. 
enum DATA_SHARING_SIZES { // The maximum number of workers in a kernel. @@ -80,6 +120,7 @@ struct DataSharingStateTy { __kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number]; void *StackPtr[DS_Max_Warp_Number]; + __kmpc_data_sharing_slot *TailPtr[DS_Max_Warp_Number]; void *FramePtr[DS_Max_Warp_Number]; int32_t ActiveThreads[DS_Max_Warp_Number]; }; @@ -87,6 +128,7 @@ // size of 4*32 bytes. struct __kmpc_data_sharing_worker_slot_static { __kmpc_data_sharing_slot *Next; + __kmpc_data_sharing_slot *Prev; void *DataEnd; char Data[DS_Worker_Warp_Slot_Size]; }; @@ -94,6 +136,7 @@ // size of 4 bytes. struct __kmpc_data_sharing_master_slot_static { __kmpc_data_sharing_slot *Next; + __kmpc_data_sharing_slot *Prev; void *DataEnd; char Data[DS_Slot_Size]; }; @@ -223,6 +266,7 @@ master_rootS[0].DataEnd = &master_rootS[0].Data[0] + DS_Slot_Size; // We currently do not have a next slot. master_rootS[0].Next = 0; + master_rootS[0].Prev = 0; return (__kmpc_data_sharing_slot *)&master_rootS[0]; } // Initialize the pointer to the end of the slot given the size of the data @@ -231,6 +275,7 @@ &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size; // We currently do not have a next slot. worker_rootS[wid].Next = 0; + worker_rootS[wid].Prev = 0; return (__kmpc_data_sharing_slot *)&worker_rootS[wid]; } Index: libomptarget/deviceRTLs/nvptx/src/option.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/option.h +++ libomptarget/deviceRTLs/nvptx/src/option.h @@ -28,6 +28,10 @@ // region to synchronize with each other. #define L1_BARRIER (1) +// Maximum number of preallocated arguments to an outlined parallel/simd function. +// Anything more requires dynamic memory allocation. +#define MAX_SHARED_ARGS 20 + // Maximum number of omp state objects per SM allocated statically in global // memory. #if __CUDA_ARCH__ >= 600