diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt --- a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt +++ b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt @@ -68,6 +68,7 @@ ${devicertl_base_directory}/common/src/reduction.cu ${devicertl_base_directory}/common/src/support.cu ${devicertl_base_directory}/common/src/sync.cu + ${devicertl_base_directory}/common/src/target_region.cu ${devicertl_base_directory}/common/src/task.cu) set(h_files @@ -80,6 +81,7 @@ ${devicertl_base_directory}/common/omptargeti.h ${devicertl_base_directory}/common/state-queue.h ${devicertl_base_directory}/common/target_atomic.h + ${devicertl_base_directory}/common/target_region.h ${devicertl_base_directory}/common/state-queuei.h ${devicertl_base_directory}/common/support.h) diff --git a/openmp/libomptarget/deviceRTLs/common/src/target_region.cu b/openmp/libomptarget/deviceRTLs/common/src/target_region.cu new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/deviceRTLs/common/src/target_region.cu @@ -0,0 +1,276 @@ +//===-- target_region.cu ---- CUDA impl. of the target region interface -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of the common target region interface. +// +//===----------------------------------------------------------------------===// + +#include "common/target_region.h" +#include "common/omptarget.h" +#include "target_impl.h" + +/// Helper structure to manage the memory shared by the threads in a team. +/// +/// Note: Only the team master is allowed to call non-const functions!
+struct target_region_shared_buffer { +#define PRE_SHARED_BYTES 128 + + /// Point the buffer at the pre-allocated storage and reset the offset. + INLINE void init() { + _ptr = &_data[0]; + _size = PRE_SHARED_BYTES; + _offset = 0; + } + + /// Release any dynamically allocated memory. + INLINE void release() { + if (_size == PRE_SHARED_BYTES) + return; + SafeFree(_ptr, "free shared dynamic buffer"); + init(); + } + + /// Use the caller provided memory \p ptr as buffer; it is never freed here. + INLINE void set(void *ptr, size_t offset) { + release(); + _ptr = ptr; + _offset = offset; + } + + /// Provide an owned buffer of at least \p size bytes and record \p offset. + INLINE void resize(size_t size, size_t offset) { + if (size > _size) { + // Drop a dynamic buffer, if any, before allocating the larger one. + release(); + _size = size; + _ptr = (char *)SafeMalloc(_size, "new shared buffer"); + } else if (_size == PRE_SHARED_BYTES) { + // A prior set() may have left _ptr aimed at caller memory; use _data. + _ptr = &_data[0]; + } + _offset = offset; + } + + // Called by all threads. + INLINE void *begin() const { return _ptr; }; + INLINE size_t size() const { return _size; }; + INLINE size_t get_offset() const { return _offset; }; + +private: + // Pre-allocated space that holds PRE_SHARED_BYTES many bytes. + char _data[PRE_SHARED_BYTES]; + + // Pointer to the currently used buffer. + void *_ptr; + + // Size of the currently used buffer. + uint32_t _size; + + // Offset into the currently used buffer. + uint32_t _offset; + +#undef PRE_SHARED_BYTES +}; + +/// The pointer used to share memory between team threads. +DEVICE SHARED target_region_shared_buffer + _shared_bytes_buffer_memory; + +/// Return the start of the buffer shared between the threads of a team. +EXTERN void *__kmpc_target_region_kernel_get_shared_memory() { + return _shared_bytes_buffer_memory.begin(); +} +/// Return the buffer position at the offset where private variables start. +EXTERN void *__kmpc_target_region_kernel_get_private_memory() { + return ((char *)_shared_bytes_buffer_memory.begin()) + + _shared_bytes_buffer_memory.get_offset(); +} + +/// Simple generic state machine for worker threads. +INLINE static void +__kmpc_target_region_state_machine(ident_t *Ident, + bool IsOMPRuntimeInitialized) { + + do { + void *WorkFn = 0; + + // Wait for the signal that we have a new work function. + __kmpc_barrier_simple_spmd(Ident, 0); + + // Retrieve the work function from the runtime.
+ bool IsActive = __kmpc_kernel_parallel(&WorkFn, IsOMPRuntimeInitialized); + + // If there is nothing more to do, break out of the state machine by + // returning to the caller. + if (!WorkFn) + return; + + if (IsActive) { + void *SharedVars = __kmpc_target_region_kernel_get_shared_memory(); + void *PrivateVars = __kmpc_target_region_kernel_get_private_memory(); + + ((ParallelWorkFnTy)WorkFn)(SharedVars, PrivateVars); + + __kmpc_kernel_end_parallel(); + } + + __kmpc_barrier_simple_spmd(Ident, 0); + + } while (true); +} + +/// Filter threads into masters and workers. If \p UseStateMachine is true, +/// required workers will enter a state machine and be trapped there. +/// Master and surplus worker threads will return from this function immediately +/// while required workers will only return once there is no more work. The +/// return value indicates if the thread is a master (1), a surplus worker (0), +/// or a finished required worker released from the state machine (-1). +INLINE static int8_t +__kmpc_target_region_thread_filter(ident_t *Ident, unsigned ThreadLimit, + bool UseStateMachine, + bool IsOMPRuntimeInitialized) { + + unsigned TId = GetThreadIdInBlock(); + bool IsWorker = TId < ThreadLimit; + + if (IsWorker) { + if (UseStateMachine) + __kmpc_target_region_state_machine(Ident, IsOMPRuntimeInitialized); + return -1; + } + + return TId == GetMasterThreadID(); +} + +EXTERN int8_t __kmpc_target_region_kernel_init(ident_t *Ident, bool UseSPMDMode, + bool RequiresOMPRuntime, + bool UseStateMachine, + bool RequiresDataSharing) { + unsigned NumThreads = GetNumberOfThreadsInBlock(); + + // Handle the SPMD case first. + if (UseSPMDMode) { + + __kmpc_spmd_kernel_init(NumThreads, RequiresOMPRuntime, + RequiresDataSharing); + + if (RequiresDataSharing) + __kmpc_data_sharing_init_stack_spmd(); + + return 1; + } + + // Reserve one WARP in non-SPMD mode for the masters.
+ unsigned ThreadLimit = NumThreads - WARPSIZE; + int8_t FilterVal = __kmpc_target_region_thread_filter( + Ident, ThreadLimit, UseStateMachine, RequiresOMPRuntime); + + // If the filter returns 1 the executing thread is a team master which will + // initialize the kernel in the following. + if (FilterVal == 1) { + __kmpc_kernel_init(ThreadLimit, RequiresOMPRuntime); + __kmpc_data_sharing_init_stack(); + _shared_bytes_buffer_memory.init(); + } + + return FilterVal; +} + +EXTERN void __kmpc_target_region_kernel_deinit(ident_t *Ident, bool UseSPMDMode, + bool RequiredOMPRuntime) { + // Handle the SPMD case first. + if (UseSPMDMode) { + __kmpc_spmd_kernel_deinit_v2(RequiredOMPRuntime); + return; + } + + __kmpc_kernel_deinit(RequiredOMPRuntime); + + // Barrier to terminate worker threads. + __kmpc_barrier_simple_spmd(Ident, 0); + + // Release any dynamically allocated memory used for sharing. + _shared_bytes_buffer_memory.release(); +} + +EXTERN void __kmpc_target_region_kernel_parallel( + ident_t *Ident, int16_t UseSPMDMode, bool RequiredOMPRuntime, + ParallelWorkFnTy ParallelWorkFn, void *SharedVars, uint16_t SharedVarsBytes, + void *PrivateVars, uint16_t PrivateVarsBytes, bool SharedMemPointers) { + + // If the mode is unknown we check it at runtime + if (UseSPMDMode == -1) + UseSPMDMode = __kmpc_is_spmd_exec_mode(); + + // In SPMD mode, we simply call the work function with the provided values. + if (UseSPMDMode) { + ParallelWorkFn(SharedVars, PrivateVars); + return; + } + + if (SharedMemPointers) { + // If shared memory pointers are used, the user guarantees that all private + // variables, if any, are stored directly after the shared ones in memory. + // Additionally, this memory can be accessed by all the threads. In that + // case, we do not need to copy memory around but simply use the provided + // locations.
However, we still need to inform the buffer of these + // locations as the worker threads might use the + // __kmpc_target_region_kernel_get_shared_memory() + // and + // __kmpc_target_region_kernel_get_private_memory() + // functions to get the respective pointers. + + _shared_bytes_buffer_memory.set(SharedVars, SharedVarsBytes); + + } else { + + size_t BytesToCopy = SharedVarsBytes + PrivateVarsBytes; + if (BytesToCopy) { + // Resize the shared memory to be able to hold the data which is required + // to be in shared memory. Also set the offset to the beginning of the + // private variables. + _shared_bytes_buffer_memory.resize(BytesToCopy, SharedVarsBytes); + + // Copy the shared and private variables into shared memory. + void *SVMemory = __kmpc_target_region_kernel_get_shared_memory(); + void *PVMemory = __kmpc_target_region_kernel_get_private_memory(); + __builtin_memcpy(SVMemory, SharedVars, SharedVarsBytes); + __builtin_memcpy(PVMemory, PrivateVars, PrivateVarsBytes); + } + } + + // TODO: It seems we could store the work function in the same shared space + // as the rest of the variables above. + // + // Initialize the parallel work, e.g., make sure the work function is known. + __kmpc_kernel_prepare_parallel((void *)ParallelWorkFn, RequiredOMPRuntime); + + // TODO: It is odd that we call the *_spmd version in non-SPMD mode here. + // + // Activate workers. This barrier is used by the master to signal + // work for the workers. + __kmpc_barrier_simple_spmd(Ident, 0); + + // OpenMP [2.5, Parallel Construct, p.49] + // There is an implied barrier at the end of a parallel region. After the + // end of a parallel region, only the master thread of the team resumes + // execution of the enclosing task region. + // + // The master waits at this barrier until all workers are done. + __kmpc_barrier_simple_spmd(Ident, 0); + + // Update the shared variables if necessary, that is if we did not use user + // memory in the first place.
+ if (!SharedMemPointers && SharedVarsBytes) + __builtin_memcpy(SharedVars, __kmpc_target_region_kernel_get_shared_memory(), + SharedVarsBytes); + + // We could set (or reset) the _shared_bytes_buffer_memory pointer to NULL (or + // the old value) if we used user provided memory. This is not necessary as + // long as the buffer knows not to free the explicitly "set" pointer. +} \ No newline at end of file diff --git a/openmp/libomptarget/deviceRTLs/common/target_region.h b/openmp/libomptarget/deviceRTLs/common/target_region.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/deviceRTLs/common/target_region.h @@ -0,0 +1,171 @@ +//===-- target_region.h -- Target region OpenMP device runtime interface --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Target region interfaces are simple interfaces designed to allow middle-end +// (=LLVM) passes to analyze and transform the code. To achieve good performance +// it may be required to run the associated passes. However, implementations of +// this interface shall always provide a correct implementation as close to the +// user expected code as possible. +// +//===----------------------------------------------------------------------===// + +#ifndef _OMPTARGET_TARGET_REGION_H_ +#define _OMPTARGET_TARGET_REGION_H_ + +#include "interface.h" + +/// The target region _kernel_ interface for GPUs +/// +/// This deliberately simple interface provides the middle-end (=LLVM) with +/// easier means to reason about the semantics of the code and transform it as +/// well. The runtime calls are therefore also designed to carry sufficient +/// information necessary for optimization. +/// +/// +/// Intended usage: +/// +/// \code +/// void kernel(...)
{ +/// +/// int8_t ThreadKind = __kmpc_target_region_kernel_init(...); +/// +/// if (ThreadKind == -1) { // actual worker thread +/// if (!UsedLibraryStateMachine) +/// user_state_machine(); +/// goto exit; +/// } else if (ThreadKind == 0) { // surplus worker thread +/// goto exit; +/// } else { // team master thread +/// goto user_code; +/// } +/// +/// user_code: +/// +/// // User defined kernel code, parallel regions are replaced +/// // by __kmpc_target_region_kernel_parallel(...) calls. +/// +/// // Fallthrough to de-initialization +/// +/// deinit: +/// __kmpc_target_region_kernel_deinit(...); +/// +/// exit: +/// /* exit the kernel */ +/// } +/// \endcode +/// +/// +///{ + +/// Initialization +/// +/// +/// In SPMD mode, all threads will execute their respective initialization +/// routines. +/// +/// In non-SPMD mode, team masters will invoke the initialization routines while +/// the rest are considered worker threads. Worker threads required for this +/// target region will be trapped inside the function if \p UseStateMachine is +/// true. Otherwise they will escape with a return value of -1. +/// +/// \param Ident Source location identification, can be NULL. +/// \param UseSPMDMode Flag to indicate if execution is performed in +/// SPMD mode. +/// \param RequiresOMPRuntime Flag to indicate if the runtime is required and +/// needs to be initialized. +/// \param UseStateMachine Flag to indicate if the runtime state machine +/// should be used in non-SPMD mode. +/// \param RequiresDataSharing Flag to indicate if there might be inter-thread +/// sharing which needs runtime support. +/// +/// \return 1, always in SPMD mode, and in non-SPMD mode if the thread is the +/// team master. +/// 0, in non-SPMD mode and the thread is a surplus worker that should +/// not execute anything in the target region. +/// -1, in non-SPMD mode and the thread is a required worker which: +/// - finished work and should be terminated if \p UseStateMachine +/// is true.
+/// - has not performed work and should be put in a user provided +/// state machine (as defined above). +/// +EXTERN int8_t __kmpc_target_region_kernel_init(ident_t *Ident, + bool UseSPMDMode, + bool RequiresOMPRuntime, + bool UseStateMachine, + bool RequiresDataSharing); + +/// De-Initialization +/// +/// +/// In non-SPMD, this function releases the workers trapped in a state machine +/// and also any memory dynamically allocated by the runtime. +/// +/// \param Ident Source location identification, can be NULL. +/// \param UseSPMDMode Flag to indicate if execution is performed in +/// SPMD mode. +/// \param RequiredOMPRuntime Flag to indicate if the runtime was required and +/// is therefore initialized. +/// +EXTERN void __kmpc_target_region_kernel_deinit(ident_t *Ident, + bool UseSPMDMode, + bool RequiredOMPRuntime); + +/// Generic type of a work function in the target region kernel interface. The +/// two arguments are pointers to structures that contain the shared and +/// firstprivate variables respectively. Since the layout and size are known at +/// compile time, the front-end is expected to generate appropriate packing and +/// unpacking code. +typedef void (*ParallelWorkFnTy)(void * /* SharedValues */, + void * /* PrivateValues */); + +/// Enter a parallel region +/// +/// +/// The parallel region is defined by \p ParallelWorkFn. The shared variables, +/// \p SharedValuesBytes bytes in total, start at \p SharedValues. The +/// firstprivate variables, \p PrivateValuesBytes bytes in total, start at +/// \p PrivateValues. +/// +/// In SPMD mode, this function calls \p ParallelWorkFn with \p SharedValues and +/// \p PrivateValues as arguments before it returns. +/// +/// In non-SPMD mode, \p ParallelWorkFn, \p SharedValues, and \p PrivateValues +/// are communicated to the workers before they are released from the state +/// machine to run the code defined by \p ParallelWorkFn in parallel. This +/// function will only return after all workers are finished.
+/// +/// \param Ident Source location identification, can be NULL. +/// \param UseSPMDMode Flag to indicate if execution is performed in +/// SPMD mode with three potential values: +/// -1, to indicate unknown mode, a runtime check +/// should then determine the current mode. +/// 0, to indicate non-SPMD mode. +/// 1, to indicate SPMD mode. +/// \param RequiredOMPRuntime Flag to indicate if the runtime was required and +/// is therefore initialized. +/// \param ParallelWorkFn The outlined code that is executed in parallel by +/// the threads in the team. +/// \param SharedValues A pointer to the location of all shared values. +/// \param SharedValuesBytes The total size of the shared values in bytes. +/// \param PrivateValues A pointer to the location of all private values. +/// \param PrivateValuesBytes The total size of the private values in bytes. +/// \param SharedMemPointers Flag to indicate that the pointer \p SharedValues +/// and \p PrivateValues point into shared memory. +/// If this flag is true, it also requires that all +/// private values, if any, are stored directly after +/// the shared values. +/// +EXTERN void __kmpc_target_region_kernel_parallel( + ident_t *Ident, int16_t UseSPMDMode, bool RequiredOMPRuntime, + ParallelWorkFnTy ParallelWorkFn, void *SharedValues, + uint16_t SharedValuesBytes, void *PrivateValues, + uint16_t PrivateValuesBytes, bool SharedMemPointers); + +///} + +#endif diff --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt --- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt +++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt @@ -61,6 +61,7 @@ ${devicertl_common_directory}/src/reduction.cu ${devicertl_common_directory}/src/support.cu ${devicertl_common_directory}/src/sync.cu + ${devicertl_common_directory}/src/target_region.cu ${devicertl_common_directory}/src/task.cu src/target_impl.cu )