diff --git a/openmp/libomptarget/deviceRTLs/common/target_region.h b/openmp/libomptarget/deviceRTLs/common/target_region.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/deviceRTLs/common/target_region.h @@ -0,0 +1,167 @@ +//===-- target_region.h --- Target region OpenMP devie runtime interface --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Target region interfaces are simple interfaces designed to allow middle-end +// (=LLVM) passes to analyze and transform the code. To achieve good performance +// it may be required to run the associated passes. However, implementations of +// this interface shall always provide a correct implementation as close to the +// user expected code as possible. +// +//===----------------------------------------------------------------------===// + +#ifndef _DEVICERTL_COMMON_INTERFACES_H_ +#define _DEVICERTL_COMMON_INTERFACES_H_ + +#ifndef EXTERN +#define EXTERN +#endif +#ifndef CALLBACK +#define CALLBACK(Callee, Payload0, Payload1) +#endif + +/// The target region _kernel_ interface for GPUs +/// +/// This deliberatly simple interface provides the middle-end (=LLVM) with +/// easier means to reason about the semantic of the code and transform it as +/// well. The runtime calls are therefore also desiged to carry sufficient +/// information necessary for optimization. +/// +/// +/// Intended usage: +/// +/// \code +/// void kernel(...) { +/// +/// char ThreadKind = __kmpc_target_region_kernel_init(...); +/// +/// if (ThreadKind == -1) { // actual worker thread +/// if (!UsedLibraryStateMachine) +/// user_state_machine(); +/// goto exit; +/// } else if (ThreadKind == 0) { // surplus worker thread +/// goto exit; +/// } else { // team master thread +/// goto user_code; +/// } +/// +/// user_code: +/// +/// // User defined kernel code, parallel regions are replaced by +/// // by __kmpc_target_region_kernel_parallel(...) calls. +/// +/// // Fallthrough to de-initialization +/// +/// deinit: +/// __kmpc_target_region_kernel_deinit(...); +/// +/// exit: +/// /* exit the kernel */ +/// } +/// \endcode +/// +/// +///{ + +/// Initialization +/// +/// +/// In SPMD mode, all threads will execute their respective initialization +/// routines. +/// +/// In non-SPMD mode, team masters will invoke the initialization routines while +/// the rest is considered a worker thread. Worker threads required for this +/// target region will be trapped inside the function if \p UseStateMachine is +/// true. Otherwise they will escape with a return value of -1 +/// +/// \param UseSPMDMode Flag to indicate if execution is performed in +/// SPMD mode. +/// \param RequiresOMPRuntime Flag to indicate if the runtime is required and +/// needs to be initialized. +/// \param UseStateMachine Flag to indicate if the runtime state machine +/// should be used in non-SPMD mode. +/// \param RequiresDataSharing Flag to indicate if there might be inter-thread +/// sharing which needs runtime support. +/// +/// \return 1, always in SPMD mode, and in non-SPMD mode if the thread is the +/// team master. +/// 0, in non-SPMD mode and the thread is a surplus worker that should +/// not execute anything in the target region. +/// -1, in non-SPMD mode and the thread is a required worker which: +/// - finished work and should be terminated if \p UseStateMachine +/// is true. +/// - has not performed work and should be put in a user provied +/// state machine (as defined above). +/// +EXTERN int8_t __kmpc_target_region_kernel_init(bool UseSPMDMode, + bool RequiresOMPRuntime, + bool UseStateMachine, + bool RequiresDataSharing); + +/// De-Initialization +/// +/// +/// In non-SPMD, this function releases the workers trapped in a state machine +/// and also any memory dynamically allocated by the runtime. +/// +/// \param UseSPMDMode Flag to indicate if execution is performed in +/// SPMD mode. +/// \param RequiredOMPRuntime Flag to indicate if the runtime was required and +/// is therefore initialized. +/// +EXTERN void __kmpc_target_region_kernel_deinit(bool UseSPMDMode, + bool RequiredOMPRuntime); + +/// Generic type of a work function in the target region kernel interface. The +/// two arguments are pointers to structures that contains the shared and +/// firstprivate variables respectively. Since the layout and size was known at +/// compile time, the front-end is expected to generate appropriate packing and +/// unpacking code. +typedef void (*ParallelWorkFnTy)(char * /* SharedValues */, + char * /* PrivateValues */); + +/// Enter a parallel region +/// +/// +/// The parallel region is defined by \p ParallelWorkFn. The shared variables, +/// \p SharedMemorySize bytes in total, start at \p SharedValues. The +/// firstprivate variables, \p PrivateValuesBytes bytes in total, start at +/// \p PrivateValues. +/// +/// In SPMD mode, this function calls \p ParallelWorkFn with \p SharedValues and +/// \p PrivateValues as arguments before it returns. +/// +/// In non-SPMD mode, \p ParallelWorkFn, \p SharedValues, and \p PrivateValues +/// are communicated to the workers before they are released from the state +/// machine to run the code defined by \p ParallelWorkFn in parallel. This +/// function will only return after all workers are finished. +/// +/// \param UseSPMDMode Flag to indicate if execution is performed in +/// SPMD mode. +/// \param RequiredOMPRuntime Flag to indicate if the runtime was required and +/// is therefore initialized. +/// \param ParallelWorkFn The outlined code that is executed in parallel by +/// the threads in the team. +/// \param SharedValues A pointer to the location of all shared values. +/// \param SharedValuesBytes The total size of the shared values in bytes. +/// \param PrivateValues A pointer to the location of all private values. +/// \param PrivateValuesBytes The total size of the private values in bytes. +/// \param SharedMemPointers Flag to indicate that the pointer \p SharedValues +/// and \p PrivateValues point into shared memory. +/// If this flag is true, it also requires that all +/// private values, if any, are stored directly after +/// the shared values. +/// +CALLBACK(ParallelWorkFnTy, SharedValues, PrivateValues) +EXTERN void __kmpc_target_region_kernel_parallel( + bool UseSPMDMode, bool RequiredOMPRuntime, ParallelWorkFnTy ParallelWorkFn, + char *SharedValues, uint16_t SharedValuesBytes, char *PrivateValues, + uint16_t PrivateValuesBytes, bool SharedMemPointers); + +///} + +#endif diff --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt --- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt +++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt @@ -53,6 +53,7 @@ src/reduction.cu src/sync.cu src/task.cu + src/target_region.cu ) set(omp_data_objects src/omp_data.cu) diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu @@ -63,3 +63,9 @@ // Data sharing related variables. //////////////////////////////////////////////////////////////////////////////// __device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs; + +//////////////////////////////////////////////////////////////////////////////// +/// Pointer to share memory between team threads in the target region interface. +//////////////////////////////////////////////////////////////////////////////// +__device__ __shared__ target_region_shared_buffer _target_region_shared_memory; + diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -101,6 +101,66 @@ uint32_t nArgs; }; +/// Helper structure to manage the memory shared by the threads in a team. +/// +/// Note: Only the team master is allowed to call non-const functions! +struct target_region_shared_buffer { +#define PRE_SHARED_BYTES 128 + + INLINE void init() { + _ptr = &_data[0]; + _size = PRE_SHARED_BYTES; + _offset = 0; + } + + /// Release any dynamic allocated memory. + INLINE void release() { + if (_size == PRE_SHARED_BYTES) + return; + SafeFree(_ptr, (char *)"free shared dynamic buffer"); + init(); + } + + INLINE void set(char *ptr, size_t offset) { + release(); + _ptr = ptr; + _offset = offset; + } + + INLINE void resize(size_t size, size_t offset) { + _offset = offset; + + if (size <= _size) + return; + + if (_size != PRE_SHARED_BYTES) + SafeFree(_ptr, (char *)"free shared dynamic buffer"); + + _size = size; + _ptr = (char *)SafeMalloc(_size, (char *)"new shared buffer"); + } + + // Called by all threads. + INLINE char *begin() const { return _ptr; }; + INLINE size_t size() const { return _size; }; + INLINE size_t get_offset() const { return _offset; }; + +private: + // Pre-allocated space that holds PRE_SHARED_BYTES many bytes. + char _data[PRE_SHARED_BYTES]; + + // Pointer to the currently used buffer. + char *_ptr; + + // Size of the currently used buffer. + uint32_t _size; + + // Offset into the currently used buffer. + uint32_t _offset; + +#undef PRE_SHARED_BYTES +}; + extern __device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs; diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_region.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_region.cu new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_region.cu @@ -0,0 +1,197 @@ +//===-- target_region.cu ---- CUDA impl. of the target region interface -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of the common target region interface. +// +//===----------------------------------------------------------------------===// + +// Include the native definitions first as certain defines might be needed in +// the common interface definition below. +#include "omptarget-nvptx.h" +#include "interface.h" + +#include "../../common/target_region.h" + +/// The pointer used to share memory between team threads. +extern __device__ __shared__ target_region_shared_buffer + _target_region_shared_memory; + +EXTERN char *__kmpc_target_region_kernel_get_shared_memory() { + return _target_region_shared_memory.begin(); +} +EXTERN char *__kmpc_target_region_kernel_get_private_memory() { + return _target_region_shared_memory.begin() + + _target_region_shared_memory.get_offset(); +} + +/// Simple generic state machine for worker threads. +INLINE static void +__kmpc_target_region_state_machine(bool IsOMPRuntimeInitialized) { + + do { + void *WorkFn = 0; + + // Wait for the signal that we have a new work function. + __kmpc_barrier_simple_spmd(NULL, 0); + + // Retrieve the work function from the runtime. + bool IsActive = __kmpc_kernel_parallel(&WorkFn, IsOMPRuntimeInitialized); + + // If there is nothing more to do, break out of the state machine by + // returning to the caller. + if (!WorkFn) + return; + + if (IsActive) { + char *SharedVars = __kmpc_target_region_kernel_get_shared_memory(); + char *PrivateVars = __kmpc_target_region_kernel_get_private_memory(); + + ((ParallelWorkFnTy)WorkFn)(SharedVars, PrivateVars); + + __kmpc_kernel_end_parallel(); + } + + __kmpc_barrier_simple_spmd(NULL, 0); + + } while (true); +} + +/// Filter threads into masters and workers. If \p UseStateMachine is true, +/// required workers will enter a state machine through and be trapped there. +/// Master and surplus worker threads will return from this function immediately +/// while required workers will only return once there is no more work. The +/// return value indicates if the thread is a master (1), a surplus worker (0), +/// or a finished required worker released from the state machine (-1). +INLINE static int8_t +__kmpc_target_region_thread_filter(unsigned ThreadLimit, bool UseStateMachine, + bool IsOMPRuntimeInitialized) { + + unsigned TId = GetThreadIdInBlock(); + bool IsWorker = TId < ThreadLimit; + + if (IsWorker) { + if (UseStateMachine) + __kmpc_target_region_state_machine(IsOMPRuntimeInitialized); + return -1; + } + + return TId == GetMasterThreadID(); +} + +EXTERN int8_t __kmpc_target_region_kernel_init(bool UseSPMDMode, + bool UseStateMachine, + bool RequiresOMPRuntime, + bool RequiresDataSharing) { + unsigned NumThreads = GetNumberOfThreadsInBlock(); + + // Handle the SPMD case first. + if (UseSPMDMode) { + + __kmpc_spmd_kernel_init(NumThreads, RequiresOMPRuntime, + RequiresDataSharing); + + if (RequiresDataSharing) + __kmpc_data_sharing_init_stack_spmd(); + + return 1; + } + + // Reserve one WARP in non-SPMD mode for the masters. + unsigned ThreadLimit = NumThreads - WARPSIZE; + int8_t FilterVal = __kmpc_target_region_thread_filter( + ThreadLimit, UseStateMachine, RequiresOMPRuntime); + + // If the filter returns 1 the executing thread is a team master which will + // initialize the kernel in the following. + if (FilterVal == 1) { + __kmpc_kernel_init(ThreadLimit, RequiresOMPRuntime); + __kmpc_data_sharing_init_stack(); + _target_region_shared_memory.init(); + } + + return FilterVal; +} + +EXTERN void __kmpc_target_region_kernel_deinit(bool UseSPMDMode, + bool RequiredOMPRuntime) { + // Handle the SPMD case first. + if (UseSPMDMode) { + __kmpc_spmd_kernel_deinit_v2(RequiredOMPRuntime); + return; + } + + __kmpc_kernel_deinit(RequiredOMPRuntime); + + // Barrier to terminate worker threads. + __kmpc_barrier_simple_spmd(NULL, 0); + + // Release any dynamically allocated memory used for sharing. + _target_region_shared_memory.release(); +} + +EXTERN void __kmpc_target_region_kernel_parallel( + bool UseSPMDMode, bool RequiredOMPRuntime, ParallelWorkFnTy ParallelWorkFn, + char *SharedVars, uint16_t SharedVarsBytes, char *PrivateVars, + uint16_t PrivateVarsBytes, bool SharedMemPointers) { + + if (UseSPMDMode) { + ParallelWorkFn(SharedVars, PrivateVars); + return; + } + + if (SharedMemPointers) { + // If shared memory pointers are used the user guarantees that any private + // variables, if any, are stored directly after the shared ones in memory + // and that this memory can be accessed by all the threads. In that case, + // we do not need to copy memory around but simply use the provided + // locations. + + _target_region_shared_memory.set(SharedVars, SharedVarsBytes); + + } else { + + size_t BytesToCopy = SharedVarsBytes + PrivateVarsBytes; + if (BytesToCopy) { + // Resize the shared memory to be able to hold the data which is required + // to be in shared memory. Also set the offset to the beginning to the + // private variables. + _target_region_shared_memory.resize(BytesToCopy, SharedVarsBytes); + + // Copy the shared and private variables into shared memory. + char *SVMemory = __kmpc_target_region_kernel_get_shared_memory(); + char *PVMemory = __kmpc_target_region_kernel_get_private_memory(); + memcpy(SVMemory, SharedVars, SharedVarsBytes); + memcpy(PVMemory, PrivateVars, PrivateVarsBytes); + } + } + + // TODO: It seems we could store the work function in the same shared space + // as the rest of the variables above. + // + // Initialize the parallel work, e.g., make sure the work function is known. + __kmpc_kernel_prepare_parallel((void *)ParallelWorkFn, RequiredOMPRuntime); + + // TODO: It is odd that we call the *_spmd version in non-SPMD mode here. + // + // Activate workers. This barrier is used by the master to signal + // work for the workers. + __kmpc_barrier_simple_spmd(NULL, 0); + + // OpenMP [2.5, Parallel Construct, p.49] + // There is an implied barrier at the end of a parallel region. After the + // end of a parallel region, only the master thread of the team resumes + // execution of the enclosing task region. + // + // The master waits at this barrier until all workers are done. + __kmpc_barrier_simple_spmd(NULL, 0); + + // Update the shared variables if necessary. + if (!SharedVars && SharedVarsBytes) + memcpy(SharedVars, __kmpc_target_region_kernel_get_shared_memory(), + SharedVarsBytes); +}