diff --git a/openmp/libomptarget/deviceRTLs/common/target_region.h b/openmp/libomptarget/deviceRTLs/common/target_region.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/deviceRTLs/common/target_region.h @@ -0,0 +1,180 @@ +//===-- target_region.h --- Target region OpenMP device runtime interface -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Target region interfaces are simple interfaces designed to allow middle-end +// (=LLVM) passes to analyze and transform the code. To achieve good performance +// it may be required to run the associated passes. However, implementations of +// this interface shall always provide a correct implementation as close to the +// user expected code as possible. +// +//===----------------------------------------------------------------------===// + +#ifndef _DEVICERTL_COMMON_INTERFACES_H_ +#define _DEVICERTL_COMMON_INTERFACES_H_ + +#ifndef EXTERN +#define EXTERN +#endif +#ifndef CALLBACK +#define CALLBACK(Callee, Payload0, Payload1) +#endif + +/// Forward declaration of the source location identifier "ident". +typedef struct ident ident_t; + +/// The target region _kernel_ interface for GPUs +/// +/// This deliberately simple interface provides the middle-end (=LLVM) with +/// easier means to reason about the semantics of the code and transform it as +/// well. The runtime calls are therefore also designed to carry sufficient +/// information necessary for optimization. +/// +/// +/// Intended usage: +/// +/// \code +/// void kernel(...) 
{ +/// +/// char ThreadKind = __kmpc_target_region_kernel_init(...); +/// +/// if (ThreadKind == -1) { // actual worker thread +/// if (!UsedLibraryStateMachine) +/// user_state_machine(); +/// goto exit; +/// } else if (ThreadKind == 0) { // surplus worker thread +/// goto exit; +/// } else { // team master thread +/// goto user_code; +/// } +/// +/// user_code: +/// +/// // User defined kernel code, parallel regions are replaced by +/// // __kmpc_target_region_kernel_parallel(...) calls. +/// +/// // Fallthrough to de-initialization +/// +/// deinit: +/// __kmpc_target_region_kernel_deinit(...); +/// +/// exit: +/// /* exit the kernel */ +/// } +/// \endcode +/// +/// +///{ + +/// Initialization +/// +/// +/// In SPMD mode, all threads will execute their respective initialization +/// routines. +/// +/// In non-SPMD mode, team masters will invoke the initialization routines while +/// the rest are considered worker threads. Worker threads required for this +/// target region will be trapped inside the function if \p UseStateMachine is +/// true. Otherwise they will escape with a return value of -1. +/// +/// \param Ident Source location identification, can be NULL. +/// \param UseSPMDMode Flag to indicate if execution is performed in +/// SPMD mode. +/// \param RequiresOMPRuntime Flag to indicate if the runtime is required and +/// needs to be initialized. +/// \param UseStateMachine Flag to indicate if the runtime state machine +/// should be used in non-SPMD mode. +/// \param RequiresDataSharing Flag to indicate if there might be inter-thread +/// sharing which needs runtime support. +/// +/// \return 1, always in SPMD mode, and in non-SPMD mode if the thread is the +/// team master. +/// 0, in non-SPMD mode and the thread is a surplus worker that should +/// not execute anything in the target region. +/// -1, in non-SPMD mode and the thread is a required worker which: +/// - finished work and should be terminated if \p UseStateMachine +/// is true. 
+/// - has not performed work and should be put in a user provided +/// state machine (as defined above). +/// +EXTERN int8_t __kmpc_target_region_kernel_init(ident_t *Ident, + bool UseSPMDMode, + bool RequiresOMPRuntime, + bool UseStateMachine, + bool RequiresDataSharing); + +/// De-Initialization +/// +/// +/// In non-SPMD, this function releases the workers trapped in a state machine +/// and also any memory dynamically allocated by the runtime. +/// +/// \param Ident Source location identification, can be NULL. +/// \param UseSPMDMode Flag to indicate if execution is performed in +/// SPMD mode. +/// \param RequiredOMPRuntime Flag to indicate if the runtime was required and +/// is therefore initialized. +/// +EXTERN void __kmpc_target_region_kernel_deinit(ident_t *Ident, + bool UseSPMDMode, + bool RequiredOMPRuntime); + +/// Generic type of a work function in the target region kernel interface. The +/// two arguments are pointers to structures that contain the shared and +/// firstprivate variables respectively. Since the layout and size are known at +/// compile time, the front-end is expected to generate appropriate packing and +/// unpacking code. +typedef void (*ParallelWorkFnTy)(void * /* SharedValues */, + void * /* PrivateValues */); + +/// Enter a parallel region +/// +/// +/// The parallel region is defined by \p ParallelWorkFn. The shared variables, +/// \p SharedValuesBytes bytes in total, start at \p SharedValues. The +/// firstprivate variables, \p PrivateValuesBytes bytes in total, start at +/// \p PrivateValues. +/// +/// In SPMD mode, this function calls \p ParallelWorkFn with \p SharedValues and +/// \p PrivateValues as arguments before it returns. +/// +/// In non-SPMD mode, \p ParallelWorkFn, \p SharedValues, and \p PrivateValues +/// are communicated to the workers before they are released from the state +/// machine to run the code defined by \p ParallelWorkFn in parallel. This +/// function will only return after all workers are finished. 
+/// +/// \param Ident Source location identification, can be NULL. +/// \param UseSPMDMode Flag to indicate if execution is performed in +/// SPMD mode with three potential values: +/// -1, to indicate unknown mode, a runtime check +/// should then determine the current mode. +/// 0, to indicate no SPMD mode. +/// 1, to indicate SPMD mode. +/// NOTE: the parameter type is unsigned (uint16_t), +/// so -1 is received as (uint16_t)-1, i.e., 65535. +/// \param RequiredOMPRuntime Flag to indicate if the runtime was required and +/// is therefore initialized. +/// \param ParallelWorkFn The outlined code that is executed in parallel by +/// the threads in the team. +/// \param SharedValues A pointer to the location of all shared values. +/// \param SharedValuesBytes The total size of the shared values in bytes. +/// \param PrivateValues A pointer to the location of all private values. +/// \param PrivateValuesBytes The total size of the private values in bytes. +/// \param SharedMemPointers Flag to indicate that the pointer \p SharedValues +/// and \p PrivateValues point into shared memory. +/// If this flag is true, it also requires that all +/// private values, if any, are stored directly after +/// the shared values. 
+/// +CALLBACK(ParallelWorkFnTy, SharedValues, PrivateValues) +EXTERN void __kmpc_target_region_kernel_parallel( + ident_t *Ident, uint16_t UseSPMDMode, bool RequiredOMPRuntime, + ParallelWorkFnTy ParallelWorkFn, void *SharedValues, + uint16_t SharedValuesBytes, void *PrivateValues, + uint16_t PrivateValuesBytes, bool SharedMemPointers); + +///} + +#endif diff --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt --- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt +++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt @@ -53,6 +53,7 @@ src/reduction.cu src/sync.cu src/task.cu + src/target_region.cu ) set(omp_data_objects src/omp_data.cu) diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_region.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_region.cu new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_region.cu @@ -0,0 +1,210 @@ +//===-- target_region.cu ---- CUDA impl. of the target region interface -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of the common target region interface. +// +//===----------------------------------------------------------------------===// + +// Include the native definitions first as certain defines might be needed in +// the common interface definition below. 
+#include "omptarget-nvptx.h" +#include "interface.h" + +#include "../../common/target_region.h" + +EXTERN void *__kmpc_target_region_kernel_get_shared_memory() { + return _shared_bytes_buffer_memory.begin(); +} +EXTERN void *__kmpc_target_region_kernel_get_private_memory() { + return ((char *)_shared_bytes_buffer_memory.begin()) + + _shared_bytes_buffer_memory.get_offset(); +} + +/// Simple generic state machine for worker threads. +INLINE static void +__kmpc_target_region_state_machine(ident_t *Ident, + bool IsOMPRuntimeInitialized) { + + do { + void *WorkFn = 0; + + // Wait for the signal that we have a new work function. + __kmpc_barrier_simple_spmd(Ident, 0); + + // Retrieve the work function from the runtime. + bool IsActive = __kmpc_kernel_parallel(&WorkFn, IsOMPRuntimeInitialized); + + // If there is nothing more to do, break out of the state machine by + // returning to the caller. + if (!WorkFn) + return; + + if (IsActive) { + void *SharedVars = __kmpc_target_region_kernel_get_shared_memory(); + void *PrivateVars = __kmpc_target_region_kernel_get_private_memory(); + + ((ParallelWorkFnTy)WorkFn)(SharedVars, PrivateVars); + + __kmpc_kernel_end_parallel(); + } + + __kmpc_barrier_simple_spmd(Ident, 0); + + } while (true); +} + +/// Filter threads into masters and workers. If \p UseStateMachine is true, +/// required workers will enter a state machine through and be trapped there. +/// Master and surplus worker threads will return from this function immediately +/// while required workers will only return once there is no more work. The +/// return value indicates if the thread is a master (1), a surplus worker (0), +/// or a finished required worker released from the state machine (-1). 
+INLINE static int8_t +__kmpc_target_region_thread_filter(ident_t *Ident, unsigned ThreadLimit, + bool UseStateMachine, + bool IsOMPRuntimeInitialized) { + + unsigned TId = GetThreadIdInBlock(); + bool IsWorker = TId < ThreadLimit; + + if (IsWorker) { + if (UseStateMachine) + __kmpc_target_region_state_machine(Ident, IsOMPRuntimeInitialized); + return -1; + } + + return TId == GetMasterThreadID(); +} + +EXTERN int8_t __kmpc_target_region_kernel_init(ident_t *Ident, bool UseSPMDMode, + bool RequiresOMPRuntime, + bool UseStateMachine, + bool RequiresDataSharing) { + unsigned NumThreads = GetNumberOfThreadsInBlock(); + + // Handle the SPMD case first. + if (UseSPMDMode) { + + __kmpc_spmd_kernel_init(NumThreads, RequiresOMPRuntime, + RequiresDataSharing); + + if (RequiresDataSharing) + __kmpc_data_sharing_init_stack_spmd(); + + return 1; + } + + // Reserve one WARP in non-SPMD mode for the masters. + unsigned ThreadLimit = NumThreads - WARPSIZE; + int8_t FilterVal = __kmpc_target_region_thread_filter( + Ident, ThreadLimit, UseStateMachine, RequiresOMPRuntime); + + // If the filter returns 1 the executing thread is a team master which will + // initialize the kernel in the following. + if (FilterVal == 1) { + __kmpc_kernel_init(ThreadLimit, RequiresOMPRuntime); + __kmpc_data_sharing_init_stack(); + _shared_bytes_buffer_memory.init(); + } + + return FilterVal; +} + +EXTERN void __kmpc_target_region_kernel_deinit(ident_t *Ident, bool UseSPMDMode, + bool RequiredOMPRuntime) { + // Handle the SPMD case first. + if (UseSPMDMode) { + __kmpc_spmd_kernel_deinit_v2(RequiredOMPRuntime); + return; + } + + __kmpc_kernel_deinit(RequiredOMPRuntime); + + // Barrier to terminate worker threads. + __kmpc_barrier_simple_spmd(Ident, 0); + + // Release any dynamically allocated memory used for sharing. 
+ _shared_bytes_buffer_memory.release(); +} + +EXTERN void __kmpc_target_region_kernel_parallel( + ident_t *Ident, uint16_t UseSPMDMode, bool RequiredOMPRuntime, + ParallelWorkFnTy ParallelWorkFn, void *SharedVars, uint16_t SharedVarsBytes, + void *PrivateVars, uint16_t PrivateVarsBytes, bool SharedMemPointers) { + + // If the mode is unknown we check it at runtime + if (UseSPMDMode == -1) + UseSPMDMode = __kmpc_is_spmd_exec_mode(); + + // In SPMD mode, we simply call the work function with the provided values. + if (UseSPMDMode) { + ParallelWorkFn(SharedVars, PrivateVars); + return; + } + + if (SharedMemPointers) { + // If shared memory pointers are used, the user guarantees that all private + // variables, if any, are stored directly after the shared ones in memory. + // Additionally, this memory can be accessed by all the threads. In that + // case, we do not need to copy memory around but simply use the provided + // locations. However, we still need to inform the buffer of these + // locations as the worker threads might use the + // __kmpc_target_region_kernel_get_shared_memory() + // and + // __kmpc_target_region_kernel_get_private_memory() + // functions to get the respective pointers. + + _shared_bytes_buffer_memory.set(SharedVars, SharedVarsBytes); + + } else { + + size_t BytesToCopy = SharedVarsBytes + PrivateVarsBytes; + if (BytesToCopy) { + // Resize the shared memory to be able to hold the data which is required + // to be in shared memory. Also set the offset to the beginning to the + // private variables. + _shared_bytes_buffer_memory.resize(BytesToCopy, SharedVarsBytes); + + // Copy the shared and private variables into shared memory. 
+ void *SVMemory = __kmpc_target_region_kernel_get_shared_memory(); + void *PVMemory = __kmpc_target_region_kernel_get_private_memory(); + memcpy(SVMemory, SharedVars, SharedVarsBytes); + memcpy(PVMemory, PrivateVars, PrivateVarsBytes); + } + } + + // TODO: It seems we could store the work function in the same shared space + // as the rest of the variables above. + // + // Initialize the parallel work, e.g., make sure the work function is known. + __kmpc_kernel_prepare_parallel((void *)ParallelWorkFn, RequiredOMPRuntime); + + // TODO: It is odd that we call the *_spmd version in non-SPMD mode here. + // + // Activate workers. This barrier is used by the master to signal + // work for the workers. + __kmpc_barrier_simple_spmd(Ident, 0); + + // OpenMP [2.5, Parallel Construct, p.49] + // There is an implied barrier at the end of a parallel region. After the + // end of a parallel region, only the master thread of the team resumes + // execution of the enclosing task region. + // + // The master waits at this barrier until all workers are done. + __kmpc_barrier_simple_spmd(Ident, 0); + + // Update the shared variables if necessary, that is if we did not use user + // memory in the first . + if (!SharedMemPointers && SharedVarsBytes) + memcpy(SharedVars, __kmpc_target_region_kernel_get_shared_memory(), + SharedVarsBytes); + + // We could set (or reset) the _shared_bytes_buffer_memory pointer to NULL (or + // the old value) if we used user provided memory. This is not necessary as + // long as the buffer knows not to free the explicitly "set" pointer. +}