diff --git a/openmp/libomptarget/CMakeLists.txt b/openmp/libomptarget/CMakeLists.txt --- a/openmp/libomptarget/CMakeLists.txt +++ b/openmp/libomptarget/CMakeLists.txt @@ -78,6 +78,7 @@ # Build offloading plugins and device RTLs if they are available. add_subdirectory(plugins) add_subdirectory(deviceRTLs) +add_subdirectory(DeviceRTL) # Add tests. add_subdirectory(test) diff --git a/openmp/libomptarget/DeviceRTL/CMakeLists.txt b/openmp/libomptarget/DeviceRTL/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/CMakeLists.txt @@ -0,0 +1,208 @@ +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build the Device RTL for all toolchains that are available +# +##===----------------------------------------------------------------------===## + +# TODO: copied from NVPTX, need to be generalized. + +# By default we will not build NVPTX deviceRTL on a CUDA free system +set(LIBOMPTARGET_BUILD_NVPTX_BCLIB FALSE CACHE BOOL + "Whether build NVPTX deviceRTL on CUDA free system.") + +if (NOT (LIBOMPTARGET_DEP_CUDA_FOUND OR LIBOMPTARGET_BUILD_NVPTX_BCLIB)) + libomptarget_say("Not building NVPTX deviceRTL by default on CUDA free system.") + return() +endif() + +# Check if we can create an LLVM bitcode implementation of the runtime library +# that could be inlined in the user application. For that we need to find +# a Clang compiler capable of compiling our CUDA files to LLVM bitcode and +# an LLVM linker. +set(LIBOMPTARGET_NVPTX_CUDA_COMPILER "" CACHE STRING + "Location of a CUDA compiler capable of emitting LLVM bitcode.") +set(LIBOMPTARGET_NVPTX_BC_LINKER "" CACHE STRING + "Location of a linker capable of linking LLVM bitcode objects.") + +if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER STREQUAL "") + set(cuda_compiler ${LIBOMPTARGET_NVPTX_CUDA_COMPILER}) +elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang") + set(cuda_compiler ${CMAKE_C_COMPILER}) +else() + libomptarget_say("Not building NVPTX deviceRTL: clang not found") + return() +endif() + +# Get compiler directory to try to locate a suitable linker. +get_filename_component(compiler_dir ${cuda_compiler} DIRECTORY) +set(llvm_link "${compiler_dir}/llvm-link") +set(opt "${compiler_dir}/opt") + +if (NOT LIBOMPTARGET_NVPTX_BC_LINKER STREQUAL "") + set(bc_linker ${LIBOMPTARGET_NVPTX_BC_LINKER}) +elseif (EXISTS ${llvm_link}) + set(bc_linker ${llvm_link}) +else() + libomptarget_say("Not building NVPTX deviceRTL: llvm-link not found") + return() +endif() + +# TODO: This part needs to be refined when libomptarget is going to support +# Windows! +# TODO: This part can also be removed if we can change the clang driver to make +# it support device only compilation. 
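+# Determine the auxiliary (host) triple. It is passed to clang via -aux-triple
+# below so the device bitcode is compiled against the matching host environment.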
+if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64") + set(aux_triple x86_64-unknown-linux-gnu) +elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "ppc64le") + set(aux_triple powerpc64le-unknown-linux-gnu) +elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64") + set(aux_triple aarch64-unknown-linux-gnu) +else() + libomptarget_say("Not building CUDA offloading device RTL: unknown host arch: ${CMAKE_HOST_SYSTEM_PROCESSOR}") + return() +endif() + +set(devicertl_base_directory ${CMAKE_CURRENT_SOURCE_DIR}) +set(include_directory ${devicertl_base_directory}/include) +set(source_directory ${devicertl_base_directory}/src) + +set(all_capabilities 35 37 50 52 53 60 61 62 70 72 75 80) + +set(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES ${all_capabilities} CACHE STRING + "List of CUDA Compute Capabilities to be used to compile the NVPTX device RTL.") +string(TOLOWER ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES} LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES) + +if (LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES STREQUAL "all") + set(nvptx_sm_list ${all_capabilities}) +elseif(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES STREQUAL "auto") + if (NOT LIBOMPTARGET_DEP_CUDA_FOUND) + libomptarget_error_say("[NVPTX] Cannot auto detect compute capability as CUDA not found.") + endif() + set(nvptx_sm_list ${LIBOMPTARGET_DEP_CUDA_ARCH}) +else() + string(REPLACE "," ";" nvptx_sm_list "${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES}") +endif() + +# If user set LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES to empty, we disable the +# build. +if (NOT nvptx_sm_list) + libomptarget_say("Not building CUDA offloading device RTL: empty compute capability list") + return() +endif() + +# Check all SM values +foreach(sm ${nvptx_sm_list}) + if (NOT ${sm} IN_LIST all_capabilities) + libomptarget_warning_say("[NVPTX] Compute capability ${sm} is not supported. Make sure clang can work with it.") + endif() +endforeach() + +# Override default MAX_SM in src/target_impl.h if requested +if (DEFINED LIBOMPTARGET_NVPTX_MAX_SM) + set(MAX_SM_DEFINITION "-DMAX_SM=${LIBOMPTARGET_NVPTX_MAX_SM}") +endif() + +# Activate RTL message dumps if requested by the user. +set(LIBOMPTARGET_DEVICE_DEBUG FALSE CACHE BOOL + "Activate NVPTX device RTL debug messages.") + +libomptarget_say("Building CUDA LLVM bitcode offloading device RTL.") + +set(src_files + ${source_directory}/Configuration.cpp + ${source_directory}/Debug.cpp + ${source_directory}/Kernel.cpp + ${source_directory}/Mapping.cpp + ${source_directory}/Misc.cpp + ${source_directory}/Parallelism.cpp + ${source_directory}/Reduction.cpp + ${source_directory}/State.cpp + ${source_directory}/Synchronization.cpp + ${source_directory}/Tasking.cpp + ${source_directory}/Utils.cpp + ${source_directory}/Workshare.cpp +) + +set(clang_opt_flags -O1 -mllvm -openmp-opt-disable -DSHARED_SCRATCHPAD_SIZE=2048) +set(link_opt_flags -O1 -openmp-opt-disable) + +# Set flags for LLVM Bitcode compilation. +set(bc_flags -S -x c++ -std=c++17 + ${clang_opt_flags} + -target nvptx64 + -Xclang -emit-llvm-bc + -Xclang -aux-triple -Xclang ${aux_triple} + -fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device + -Xclang -target-feature -Xclang +ptx61 + -I${include_directory} +) + +if(${LIBOMPTARGET_DEVICE_DEBUG}) + list(APPEND bc_flags -DOMPTARGET_DEBUG=-1) +else() + list(APPEND bc_flags -DOMPTARGET_DEBUG=0) +endif() + +# Create target to build all Bitcode libraries. 
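+# omptarget-new-nvptx-bc is an aggregate target; the per-compute-capability
+# bitcode libraries built in the loop below are added as its dependencies.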
+add_custom_target(omptarget-new-nvptx-bc) + +# Generate a Bitcode library for all the compute capabilities the user requested +foreach(sm ${nvptx_sm_list}) + # TODO: replace this with declare variant and isa selector. + set(cuda_flags -Xclang -target-cpu -Xclang sm_${sm} "-D__CUDA_ARCH__=${sm}0") + set(bc_files "") + foreach(src ${src_files}) + get_filename_component(infile ${src} ABSOLUTE) + get_filename_component(outfile ${src} NAME) + set(outfile "${outfile}-sm_${sm}.bc") + + add_custom_command(OUTPUT ${outfile} + COMMAND ${cuda_compiler} ${bc_flags} + ${cuda_flags} ${MAX_SM_DEFINITION} ${infile} -o ${outfile} + DEPENDS ${infile} + IMPLICIT_DEPENDS CXX ${infile} + COMMENT "Building LLVM bitcode ${outfile}" + VERBATIM + ) + set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}) + + list(APPEND bc_files ${outfile}) + endforeach() + + set(bclib_name "libomptarget-new-nvptx-sm_${sm}.bc") + + # Link to a bitcode library. + add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} + COMMAND ${bc_linker} + -o ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} ${bc_files} + DEPENDS ${bc_files} + COMMENT "Linking LLVM bitcode ${bclib_name}" + ) + + add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}_opt + COMMAND ${opt} ${link_opt_flags} ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} + -o ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} + DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} + COMMENT "Optimizing LLVM bitcode ${bclib_name}" + ) + set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${bclib_name}) + + set(bclib_target_name "omptarget-new-nvptx-sm_${sm}-bc") + + add_custom_target(${bclib_target_name} ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}_opt) + add_dependencies(omptarget-new-nvptx-bc ${bclib_target_name}) + + # Copy library to destination. + add_custom_command(TARGET ${bclib_target_name} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} + ${LIBOMPTARGET_LIBRARY_DIR}) + + # Install bitcode library under the lib destination folder. + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} DESTINATION "${OPENMP_INSTALL_LIBDIR}") +endforeach() diff --git a/openmp/libomptarget/DeviceRTL/include/Configuration.h b/openmp/libomptarget/DeviceRTL/include/Configuration.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/include/Configuration.h @@ -0,0 +1,35 @@ +//===--- Configuration.h - OpenMP device configuration interface -- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// API to query the global (constant) device environment. +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_CONFIGURATION_H +#define OMPTARGET_CONFIGURATION_H + +#include "Types.h" + +namespace _OMP { +namespace config { + +enum DebugLevel : int32_t { Assertion }; + +/// Return the number of devices in the system, same number as returned on the +/// host by omp_get_num_devices. +uint32_t getNumDevices(); + +/// Return the user choosen debug level. 
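+/// Debug features for a given level are active iff the returned value is
+/// greater than that level (see isDebugMode below).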
+int32_t getDebugLevel(); + +bool isDebugMode(DebugLevel Level); + +} // namespace config +} // namespace _OMP + +#endif diff --git a/openmp/libomptarget/DeviceRTL/include/Debug.h b/openmp/libomptarget/DeviceRTL/include/Debug.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/include/Debug.h @@ -0,0 +1,30 @@ +//===-------- Debug.h ---- Debug utilities ------------------------ C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_DEVICERTL_DEBUG_H +#define OMPTARGET_DEVICERTL_DEBUG_H + +/// Assertion +/// +/// { +extern "C" { +void __assert_assume(bool cond, const char *exp, const char *file, int line); +} + +#define ASSERT(e) __assert_assume(e, #e, __FILE__, __LINE__) + +///} + +// TODO: We need to allow actual printf. +#define PRINTF(fmt, ...) (void)fmt; +#define PRINT(str) PRINTF("%s", str) + +#endif diff --git a/openmp/libomptarget/DeviceRTL/include/Interface.h b/openmp/libomptarget/DeviceRTL/include/Interface.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/include/Interface.h @@ -0,0 +1,345 @@ +//===-------- Interface.h - OpenMP interface ---------------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_DEVICERTL_INTERFACE_H +#define OMPTARGET_DEVICERTL_INTERFACE_H + +#include "Types.h" + +/// External API +/// +///{ + +extern "C" { + +/// ICV: dyn-var, constant 0 +/// +/// setter: ignored. +/// getter: returns 0. +/// +///{ +void omp_set_dynamic(int); +int omp_get_dynamic(void); +///} + +/// ICV: nthreads-var, integer +/// +/// scope: data environment +/// +/// setter: ignored. +/// getter: returns false. +/// +/// implementation notes: +/// +/// +///{ +void omp_set_num_threads(int); +int omp_get_max_threads(void); +///} + +/// ICV: thread-limit-var, computed +/// +/// getter: returns thread limited defined during launch. +/// +///{ +int omp_get_thread_limit(void); +///} + +/// ICV: max-active-level-var, constant 1 +/// +/// setter: ignored. +/// getter: returns 1. +/// +///{ +void omp_set_max_active_levels(int); +int omp_get_max_active_levels(void); +///} + +/// ICV: places-partition-var +/// +/// +///{ +///} + +/// ICV: active-level-var, 0 or 1 +/// +/// getter: returns 0 or 1. +/// +///{ +int omp_get_active_level(void); +///} + +/// ICV: level-var +/// +/// getter: returns parallel region nesting +/// +///{ +int omp_get_level(void); +///} + +/// ICV: run-sched-var +/// +/// +///{ +void omp_set_schedule(omp_sched_t, int); +void omp_get_schedule(omp_sched_t *, int *); +///} + +/// TODO this is incomplete. 
+int omp_get_num_threads(void); +int omp_get_thread_num(void); +void omp_set_nested(int); + +int omp_get_nested(void); + +void omp_set_max_active_levels(int Level); + +int omp_get_max_active_levels(void); + +omp_proc_bind_t omp_get_proc_bind(void); + +int omp_get_num_places(void); + +int omp_get_place_num_procs(int place_num); + +void omp_get_place_proc_ids(int place_num, int *ids); + +int omp_get_place_num(void); + +int omp_get_partition_num_places(void); + +void omp_get_partition_place_nums(int *place_nums); + +int omp_get_cancellation(void); + +void omp_set_default_device(int deviceId); + +int omp_get_default_device(void); + +int omp_get_num_devices(void); + +int omp_get_num_teams(void); + +int omp_get_team_num(); + +int omp_get_initial_device(void); + +/// Synchronization +/// +///{ +void omp_init_lock(omp_lock_t *Lock); + +void omp_destroy_lock(omp_lock_t *Lock); + +void omp_set_lock(omp_lock_t *Lock); + +void omp_unset_lock(omp_lock_t *Lock); + +int omp_test_lock(omp_lock_t *Lock); +///} + +/// Tasking +/// +///{ +int omp_in_final(void); + +int omp_get_max_task_priority(void); +///} + +/// Misc +/// +///{ +double omp_get_wtick(void); + +double omp_get_wtime(void); +///} +} + +extern "C" { +/// Allocate \p Bytes in "shareable" memory and return the address. Needs to be +/// called balanced with __kmpc_free_shared like a stack (push/pop). Can be +/// called by any thread, allocation happens *per thread*. +void *__kmpc_alloc_shared(uint64_t Bytes); + +/// Deallocate \p Ptr. Needs to be called balanced with __kmpc_alloc_shared like +/// a stack (push/pop). Can be called by any thread. \p Ptr has to be the +/// allocated by __kmpc_alloc_shared by the same thread. +void __kmpc_free_shared(void *Ptr, uint64_t Bytes); + +/// Allocate sufficient space for \p NumArgs sequential `void*` and store the +/// allocation address in \p GlobalArgs. +/// +/// Called by the main thread prior to a parallel region. +/// +/// We also remember it in GlobalArgsPtr to ensure the worker threads and +/// deallocation function know the allocation address too. +void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t NumArgs); + +/// Deallocate the memory allocated by __kmpc_begin_sharing_variables. +/// +/// Called by the main thread after a parallel region. +void __kmpc_end_sharing_variables(void **GlobalArgs, uint64_t NumArgs); + +/// Store the allocation address obtained via __kmpc_begin_sharing_variables in +/// \p GlobalArgs. +/// +/// Called by the worker threads in the parallel region (function). 
+void __kmpc_get_shared_variables(void ***GlobalArgs); + +/// Kernel +/// +///{ +int8_t __kmpc_is_spmd_exec_mode(); + +int32_t __kmpc_target_init(IdentTy *Ident, bool IsSPMD, + bool UseGenericStateMachine, bool); + +void __kmpc_target_deinit(IdentTy *Ident, bool IsSPMD, bool); + +///} + +/// Reduction +/// +///{ +void __kmpc_nvptx_end_reduce(int32_t TId); + +void __kmpc_nvptx_end_reduce_nowait(int32_t TId); + +int32_t __kmpc_nvptx_parallel_reduce_nowait_v2( + IdentTy *Loc, int32_t TId, int32_t num_vars, uint64_t reduce_size, + void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct); + +int32_t __kmpc_nvptx_teams_reduce_nowait_v2( + IdentTy *Loc, int32_t TId, void *GlobalBuffer, uint32_t num_of_records, + void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct, + ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, ListGlobalFnTy glcpyFct, + ListGlobalFnTy glredFct); +///} + +/// Synchronization +/// +///{ +void __kmpc_ordered(IdentTy *Loc, int32_t TId); + +void __kmpc_end_ordered(IdentTy *Loc, int32_t TId); + +int32_t __kmpc_cancel_barrier(IdentTy *Loc_ref, int32_t TId); + +void __kmpc_barrier(IdentTy *Loc_ref, int32_t TId); + +void __kmpc_barrier_simple_spmd(IdentTy *Loc_ref, int32_t TId); + +int32_t __kmpc_master(IdentTy *Loc, int32_t TId); + +void __kmpc_end_master(IdentTy *Loc, int32_t TId); + +int32_t __kmpc_single(IdentTy *Loc, int32_t TId); + +void __kmpc_end_single(IdentTy *Loc, int32_t TId); + +void __kmpc_flush(IdentTy *Loc); + +__kmpc_impl_lanemask_t __kmpc_warp_active_thread_mask(); + +void __kmpc_syncwarp(__kmpc_impl_lanemask_t Mask); + +void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name); + +void __kmpc_end_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name); +///} + +/// Parallelism +/// +///{ +/// TODO +void __kmpc_kernel_prepare_parallel(ParallelRegionFnTy WorkFn); + +/// TODO +bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn); + +/// TODO +void __kmpc_kernel_end_parallel(); + +/// TODO +void __kmpc_serialized_parallel(IdentTy *Loc, uint32_t); + +/// TODO +void __kmpc_end_serialized_parallel(IdentTy *Loc, uint32_t); + +/// TODO +void __kmpc_push_proc_bind(IdentTy *Loc, uint32_t TId, int ProcBind); + +/// TODO +void __kmpc_push_num_teams(IdentTy *Loc, int32_t TId, int32_t NumTeams, + int32_t ThreadLimit); + +/// TODO +uint16_t __kmpc_parallel_level(IdentTy *Loc, uint32_t); + +/// TODO +void __kmpc_push_num_threads(IdentTy *Loc, int32_t, int32_t NumThreads); +///} + +/// Tasking +/// +///{ +TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, uint32_t, int32_t, + uint32_t TaskSizeInclPrivateValues, + uint32_t SharedValuesSize, + TaskFnTy TaskFn); + +int32_t __kmpc_omp_task(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor); + +int32_t __kmpc_omp_task_with_deps(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor, int32_t, + void *, int32_t, void *); + +void __kmpc_omp_task_begin_if0(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor); + +void __kmpc_omp_task_complete_if0(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor); + +void __kmpc_omp_wait_deps(IdentTy *Loc, uint32_t TId, int32_t, void *, int32_t, + void *); + +void __kmpc_taskgroup(IdentTy *Loc, uint32_t TId); + +void __kmpc_end_taskgroup(IdentTy *Loc, uint32_t TId); + +int32_t __kmpc_omp_taskyield(IdentTy *Loc, uint32_t TId, int); + +int32_t __kmpc_omp_taskwait(IdentTy *Loc, uint32_t TId); + +void __kmpc_taskloop(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor, int, + uint64_t *LowerBound, uint64_t 
*UpperBound, int64_t, int, + int32_t, uint64_t, void *); +///} + +/// Misc +/// +///{ +int32_t __kmpc_cancellationpoint(IdentTy *Loc, int32_t TId, int32_t CancelVal); + +int32_t __kmpc_cancel(IdentTy *Loc, int32_t TId, int32_t CancelVal); +///} + +/// Shuffle +/// +///{ +int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size); +int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size); +///} +} + +#endif diff --git a/openmp/libomptarget/DeviceRTL/include/Mapping.h b/openmp/libomptarget/DeviceRTL/include/Mapping.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/include/Mapping.h @@ -0,0 +1,86 @@ +//===--------- Mapping.h - OpenMP device runtime mapping helpers -- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_MAPPING_H +#define OMPTARGET_MAPPING_H + +#include "Types.h" + +namespace _OMP { + +namespace mapping { + +#pragma omp declare target + +inline constexpr uint32_t MaxThreadsPerTeam = 1024; + +#pragma omp end declare target + +/// Initialize the mapping machinery. +void init(bool IsSPMD); + +/// Return true if the kernel is executed in SPMD mode. +bool isSPMDMode(); + +/// Return true if the kernel is executed in generic mode. +bool isGenericMode(); + +/// Return true if the executing thread is the main thread in generic mode. +bool isMainThreadInGenericMode(); + +/// Return true if the executing thread has the lowest Id of the active threads +/// in the warp. +bool isLeaderInWarp(); + +/// Return a mask describing all active threads in the warp. +LaneMaskTy activemask(); + +/// Return a mask describing all threads with a smaller Id in the warp. +LaneMaskTy lanemaskLT(); + +/// Return a mask describing all threads with a larget Id in the warp. +LaneMaskTy lanemaskGT(); + +/// Return the thread Id in the warp, in [0, getWarpSize()). +uint32_t getThreadIdInWarp(); + +/// Return the thread Id in the block, in [0, getBlockSize()). +uint32_t getThreadIdInBlock(); + +/// Return the warp id in the block. +uint32_t getWarpId(); + +/// Return the warp size, thus number of threads in the warp. +uint32_t getWarpSize(); + +/// Return the number of warps in the block. +uint32_t getNumberOfWarpsInBlock(); + +/// Return the block Id in the kernel, in [0, getKernelSize()). +uint32_t getBlockId(); + +/// Return the block size, thus number of threads in the block. +uint32_t getBlockSize(); + +/// Return the number of blocks in the kernel. +uint32_t getNumberOfBlocks(); + +/// Return the kernel size, thus number of threads in the kernel. +uint32_t getKernelSize(); + +/// Return the number of processing elements on the device. +uint32_t getNumberOfProcessorElements(); + +} // namespace mapping + +} // namespace _OMP + +#endif diff --git a/openmp/libomptarget/DeviceRTL/include/State.h b/openmp/libomptarget/DeviceRTL/include/State.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/include/State.h @@ -0,0 +1,200 @@ +//===-------- State.h - OpenMP State & ICV interface ------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_STATE_H +#define OMPTARGET_STATE_H + +#include "Debug.h" +#include "Types.h" + +#pragma omp declare target + +namespace _OMP { + +namespace state { + +inline constexpr uint32_t SharedScratchpadSize = SHARED_SCRATCHPAD_SIZE; + +/// Initialize the state machinery. Must be called by all threads. +void init(bool IsSPMD); + +/// TODO +enum ValueKind { + VK_NThreads, + VK_Level, + VK_ActiveLevel, + VK_MaxActiveLevels, + VK_RunSched, + // --- + VK_RunSchedChunk, + VK_ParallelRegionFn, + VK_ParallelTeamSize, +}; + +/// TODO +void enterDataEnvironment(); + +/// TODO +void exitDataEnvironment(); + +/// TODO +struct DateEnvironmentRAII { + DateEnvironmentRAII() { enterDataEnvironment(); } + ~DateEnvironmentRAII() { exitDataEnvironment(); } +}; + +/// TODO +void resetStateForThread(uint32_t TId); + +uint32_t &lookup32(ValueKind VK, bool IsReadonly); +void *&lookupPtr(ValueKind VK, bool IsReadonly); + +/// A class without actual state used to provide a nice interface to lookup and +/// update ICV values we can declare in global scope. +template struct Value { + __attribute__((flatten, always_inline)) operator Ty() { + return lookup(/* IsReadonly */ true); + } + + __attribute__((flatten, always_inline)) Value &operator=(const Ty &Other) { + set(Other); + return *this; + } + + __attribute__((flatten, always_inline)) Value &operator++() { + inc(1); + return *this; + } + + __attribute__((flatten, always_inline)) Value &operator--() { + inc(-1); + return *this; + } + +private: + Ty &lookup(bool IsReadonly) { + Ty &t = lookup32(Kind, IsReadonly); + return t; + } + + Ty &inc(int UpdateVal) { + return (lookup(/* IsReadonly */ false) += UpdateVal); + } + + Ty &set(Ty UpdateVal) { return (lookup(/* IsReadonly */ false) = UpdateVal); } + + template friend struct ValueRAII; +}; + +/// A mookup class without actual state used to provide +/// a nice interface to lookup and update ICV values +/// we can declare in global scope. +template struct PtrValue { + __attribute__((flatten, always_inline)) operator Ty() { + return lookup(/* IsReadonly */ true); + } + + __attribute__((flatten, always_inline)) PtrValue &operator=(const Ty Other) { + set(Other); + return *this; + } + +private: + Ty &lookup(bool IsReadonly) { return lookupPtr(Kind, IsReadonly); } + + Ty &set(Ty UpdateVal) { return (lookup(/* IsReadonly */ false) = UpdateVal); } + + template friend struct ValueRAII; +}; + +template struct ValueRAII { + ValueRAII(VTy &V, Ty NewValue, Ty OldValue, bool Active) + : Ptr(Active ? V.lookup(/* IsReadonly */ false) : Val), Val(OldValue), + Active(Active) { + if (!Active) + return; + ASSERT(Ptr == OldValue && "ValueRAII initialization with wrong old value!"); + Ptr = NewValue; + } + ~ValueRAII() { + if (Active) + Ptr = Val; + } + +private: + Ty &Ptr; + Ty Val; + bool Active; +}; + +/// TODO +inline state::Value RunSchedChunk; + +/// TODO +inline state::Value ParallelTeamSize; + +/// TODO +inline state::PtrValue + ParallelRegionFn; + +void runAndCheckState(void(Func(void))); + +void assumeInitialState(bool IsSPMD); + +} // namespace state + +namespace icv { + +/// TODO +inline state::Value NThreads; + +/// TODO +inline state::Value Level; + +/// The `active-level` describes which of the parallel level counted with the +/// `level-var` is active. There can only be one. 
+/// +/// active-level-var is 1, if ActiveLevelVar is not 0, otherweise it is 0. +inline state::Value ActiveLevel; + +/// TODO +inline state::Value MaxActiveLevels; + +/// TODO +inline state::Value RunSched; + +} // namespace icv + +namespace memory { + +/// Alloca \p Size bytes in shared memory, if possible, for \p Reason. +/// +/// Note: See the restrictions on __kmpc_alloc_shared for proper usage. +void *allocShared(uint64_t Size, const char *Reason); + +/// Free \p Ptr, alloated via allocShared, for \p Reason. +/// +/// Note: See the restrictions on __kmpc_free_shared for proper usage. +void freeShared(void *Ptr, uint64_t Bytes, const char *Reason); + +/// Alloca \p Size bytes in global memory, if possible, for \p Reason. +void *allocGlobal(uint64_t Size, const char *Reason); + +/// Free \p Ptr, alloated via allocGlobal, for \p Reason. +void freeGlobal(void *Ptr, const char *Reason); + +} // namespace memory + +} // namespace _OMP + +#pragma omp end declare target + +#endif diff --git a/openmp/libomptarget/DeviceRTL/include/Synchronization.h b/openmp/libomptarget/DeviceRTL/include/Synchronization.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/include/Synchronization.h @@ -0,0 +1,69 @@ +//===- Synchronization.h - OpenMP synchronization utilities ------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_DEVICERTL_SYNCHRONIZATION_H +#define OMPTARGET_DEVICERTL_SYNCHRONIZATION_H + +#include "Types.h" + +namespace _OMP { + +namespace synchronize { + +/// Initialize the synchronization machinery. Must be called by all threads. +void init(bool IsSPMD); + +/// Synchronize all threads in a warp identified by \p Mask. +void warp(LaneMaskTy Mask); + +/// Synchronize all threads in a block. +void threads(); + +} // namespace synchronize + +namespace fence { + +/// Memory fence with \p Ordering semantics for the team. +void team(int Ordering); + +/// Memory fence with \p Ordering semantics for the contention group. +void kernel(int Ordering); + +/// Memory fence with \p Ordering semantics for the system. +void system(int Ordering); + +} // namespace fence + +namespace atomic { + +/// Atomically read \p Addr with \p Ordering semantics. +uint32_t read(uint32_t *Addr, int Ordering); + +/// Atomically store \p V to \p Addr with \p Ordering semantics. +uint32_t store(uint32_t *Addr, uint32_t V, int Ordering); + +/// Atomically store \p V to \p Addr with \p Ordering semantics. +uint64_t store(uint64_t *Addr, uint64_t V, int Ordering); + +/// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics. +uint32_t inc(uint32_t *Addr, uint32_t V, int Ordering); + +/// Atomically add \p V to \p *Addr with \p Ordering semantics. +uint32_t add(uint32_t *Addr, uint32_t V, int Ordering); + +/// Atomically add \p V to \p *Addr with \p Ordering semantics. 
+uint64_t add(uint64_t *Addr, uint64_t V, int Ordering); + +} // namespace atomic + +} // namespace _OMP + +#endif diff --git a/openmp/libomptarget/DeviceRTL/include/Types.h b/openmp/libomptarget/DeviceRTL/include/Types.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/include/Types.h @@ -0,0 +1,200 @@ +//===---------- Types.h - OpenMP types ---------------------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_TYPES_H +#define OMPTARGET_TYPES_H + +/// Base type declarations for freestanding mode +/// +///{ +using int8_t = char; +using uint8_t = unsigned char; +using int16_t = short; +using uint16_t = unsigned short; +using int32_t = int; +using uint32_t = unsigned int; +using int64_t = long; +using uint64_t = unsigned long; + +static_assert(sizeof(int8_t) == 1, "type size mismatch"); +static_assert(sizeof(uint8_t) == 1, "type size mismatch"); +static_assert(sizeof(int16_t) == 2, "type size mismatch"); +static_assert(sizeof(uint16_t) == 2, "type size mismatch"); +static_assert(sizeof(int32_t) == 4, "type size mismatch"); +static_assert(sizeof(uint32_t) == 4, "type size mismatch"); +static_assert(sizeof(int64_t) == 8, "type size mismatch"); +static_assert(sizeof(uint64_t) == 8, "type size mismatch"); +///} + +enum omp_proc_bind_t { + omp_proc_bind_false = 0, + omp_proc_bind_true = 1, + omp_proc_bind_master = 2, + omp_proc_bind_close = 3, + omp_proc_bind_spread = 4 +}; + +enum omp_sched_t { + omp_sched_static = 1, /* chunkSize >0 */ + omp_sched_dynamic = 2, /* chunkSize >0 */ + omp_sched_guided = 3, /* chunkSize >0 */ + omp_sched_auto = 4, /* no chunkSize */ +}; + +enum kmp_sched_t { + kmp_sched_static_chunk = 33, + kmp_sched_static_nochunk = 34, + kmp_sched_dynamic = 35, + kmp_sched_guided = 36, + kmp_sched_runtime = 37, + kmp_sched_auto = 38, + + kmp_sched_static_balanced_chunk = 45, + + kmp_sched_static_ordered = 65, + kmp_sched_static_nochunk_ordered = 66, + kmp_sched_dynamic_ordered = 67, + kmp_sched_guided_ordered = 68, + kmp_sched_runtime_ordered = 69, + kmp_sched_auto_ordered = 70, + + kmp_sched_distr_static_chunk = 91, + kmp_sched_distr_static_nochunk = 92, + kmp_sched_distr_static_chunk_sched_static_chunkone = 93, + + kmp_sched_default = kmp_sched_static_nochunk, + kmp_sched_unordered_first = kmp_sched_static_chunk, + kmp_sched_unordered_last = kmp_sched_auto, + kmp_sched_ordered_first = kmp_sched_static_ordered, + kmp_sched_ordered_last = kmp_sched_auto_ordered, + kmp_sched_distribute_first = kmp_sched_distr_static_chunk, + kmp_sched_distribute_last = + kmp_sched_distr_static_chunk_sched_static_chunkone, + + /* Support for OpenMP 4.5 monotonic and nonmonotonic schedule modifiers. + * Since we need to distinguish the three possible cases (no modifier, + * monotonic modifier, nonmonotonic modifier), we need separate bits for + * each modifier. The absence of monotonic does not imply nonmonotonic, + * especially since 4.5 says that the behaviour of the "no modifier" case + * is implementation defined in 4.5, but will become "nonmonotonic" in 5.0. + * + * Since we're passing a full 32 bit value, we can use a couple of high + * bits for these flags; out of paranoia we avoid the sign bit. 
+ * + * These modifiers can be or-ed into non-static schedules by the compiler + * to pass the additional information. They will be stripped early in the + * processing in __kmp_dispatch_init when setting up schedules, so + * most of the code won't ever see schedules with these bits set. + */ + kmp_sched_modifier_monotonic = (1 << 29), + /**< Set if the monotonic schedule modifier was present */ + kmp_sched_modifier_nonmonotonic = (1 << 30), +/**< Set if the nonmonotonic schedule modifier was present */ + +#define SCHEDULE_WITHOUT_MODIFIERS(s) \ + (enum kmp_sched_t)( \ + (s) & ~(kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) +#define SCHEDULE_HAS_MONOTONIC(s) (((s)&kmp_sched_modifier_monotonic) != 0) +#define SCHEDULE_HAS_NONMONOTONIC(s) \ + (((s)&kmp_sched_modifier_nonmonotonic) != 0) +#define SCHEDULE_HAS_NO_MODIFIERS(s) \ + (((s) & (kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) == \ + 0) + +}; + +struct TaskDescriptorTy; +using TaskFnTy = int32_t (*)(int32_t global_tid, TaskDescriptorTy *taskDescr); +struct TaskDescriptorTy { + void *Payload; + TaskFnTy TaskFn; +}; + +#pragma omp begin declare variant match(device = {arch(amdgcn)}) +using LaneMaskTy = uint64_t; +#pragma omp end declare variant + +#pragma omp begin declare variant match( \ + device = {arch(amdgcn)}, implementation = {extension(match_none)}) +using LaneMaskTy = uint64_t; +#pragma omp end declare variant + +namespace lanes { +enum : LaneMaskTy { All = ~(LaneMaskTy)0 }; +} // namespace lanes + +/// The ident structure that describes a source location. The struct is +/// identical to the one in the kmp.h file. We maintain the same data structure +/// for compatibility. +struct IdentTy { + int32_t reserved_1; /**< might be used in Fortran; see above */ + int32_t flags; /**< also f.flags; KMP_IDENT_xxx flags; KMP_IDENT_KMPC + identifies this union member */ + int32_t reserved_2; /**< not really used in Fortran any more; see above */ + int32_t reserved_3; /**< source[4] in Fortran, do not use for C++ */ + char const *psource; /**< String describing the source location. + The string is composed of semi-colon separated fields + which describe the source file, the function and a pair + of line numbers that delimit the construct. */ +}; + +using __kmpc_impl_lanemask_t = LaneMaskTy; + +using ParallelRegionFnTy = void *; + +using CriticalNameTy = int32_t[8]; + +struct omp_lock_t { + void *Lock; +}; + +using InterWarpCopyFnTy = void (*)(void *src, int32_t warp_num); +using ShuffleReductFnTy = void (*)(void *rhsData, int16_t lane_id, + int16_t lane_offset, int16_t shortCircuit); +using ListGlobalFnTy = void (*)(void *buffer, int idx, void *reduce_data); + +/// Macros for allocating variables in different address spaces. +///{ + +// Follows the pattern in interface.h +typedef enum omp_allocator_handle_t { + omp_null_allocator = 0, + omp_default_mem_alloc = 1, + omp_large_cap_mem_alloc = 2, + omp_const_mem_alloc = 3, + omp_high_bw_mem_alloc = 4, + omp_low_lat_mem_alloc = 5, + omp_cgroup_mem_alloc = 6, + omp_pteam_mem_alloc = 7, + omp_thread_mem_alloc = 8, + KMP_ALLOCATOR_MAX_HANDLE = ~(0U) +} omp_allocator_handle_t; + +#define __PRAGMA(STR) _Pragma(#STR) +#define OMP_PRAGMA(STR) __PRAGMA(omp STR) + +#define SHARED(NAME) \ + NAME [[clang::loader_uninitialized]]; \ + OMP_PRAGMA(allocate(NAME) allocator(omp_pteam_mem_alloc)) + +// TODO: clang should use address space 5 for omp_thread_mem_alloc, but right +// now that's not the case. 
+#define THREAD_LOCAL(NAME) \ + NAME [[clang::loader_uninitialized, clang::address_space(5)]] + +// TODO: clang should use address space 4 for omp_const_mem_alloc, maybe it +// does? +#define CONSTANT(NAME) \ + NAME [[clang::loader_uninitialized, clang::address_space(4)]] + +///} + +#endif diff --git a/openmp/libomptarget/DeviceRTL/include/Utils.h b/openmp/libomptarget/DeviceRTL/include/Utils.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/include/Utils.h @@ -0,0 +1,72 @@ +//===--------- Utils.h - OpenMP device runtime utility functions -- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_DEVICERTL_UTILS_H +#define OMPTARGET_DEVICERTL_UTILS_H + +#include "Types.h" + +namespace _OMP { +namespace utils { + +/// Return the value \p Var from thread Id \p SrcLane in the warp if the thread +/// is identified by \p Mask. +int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane); + +int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width); + +/// Return \p LowBits and \p HighBits packed into a single 64 bit value. +uint64_t pack(uint32_t LowBits, uint32_t HighBits); + +/// Unpack \p Val into \p LowBits and \p HighBits. +void unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits); + +/// Round up \p V to a \p Boundary. +template inline Ty roundUp(Ty V, Ty Boundary) { + return (V + Boundary - 1) / Boundary * Boundary; +} + +/// Advance \p Ptr by \p Bytes bytes. +template inline Ty1 *advance(Ty1 Ptr, Ty2 Bytes) { + return reinterpret_cast(reinterpret_cast(Ptr) + Bytes); +} + +/// Return the first bit set in \p V. +inline uint32_t ffs(uint32_t V) { + static_assert(sizeof(int) == sizeof(uint32_t), "type size mismatch"); + return __builtin_ffs(V); +} + +/// Return the first bit set in \p V. +inline uint32_t ffs(uint64_t V) { + static_assert(sizeof(long) == sizeof(uint64_t), "type size mismatch"); + return __builtin_ffsl(V); +} + +/// Return the number of bits set in \p V. +inline uint32_t popc(uint32_t V) { + static_assert(sizeof(int) == sizeof(uint32_t), "type size mismatch"); + return __builtin_popcount(V); +} + +/// Return the number of bits set in \p V. 
+inline uint32_t popc(uint64_t V) { + static_assert(sizeof(long) == sizeof(uint64_t), "type size mismatch"); + return __builtin_popcountl(V); +} + +#define OMP_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true) +#define OMP_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false) + +} // namespace utils +} // namespace _OMP + +#endif diff --git a/openmp/libomptarget/DeviceRTL/include/generated_microtask_cases.gen b/openmp/libomptarget/DeviceRTL/include/generated_microtask_cases.gen new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/include/generated_microtask_cases.gen @@ -0,0 +1,405 @@ +case 0: +((void (*)(int32_t *, int32_t * +))fn)(&global_tid, &bound_tid +); +break; +case 1: +((void (*)(int32_t *, int32_t * +, void *))fn)(&global_tid, &bound_tid +, args[0]); +break; +case 2: +((void (*)(int32_t *, int32_t * +, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1]); +break; +case 3: +((void (*)(int32_t *, int32_t * +, void *, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2]); +break; +case 4: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +); +break; +case 5: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4]); +break; +case 6: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5]); +break; +case 7: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6]); +break; +case 8: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +); +break; +case 9: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8]); +break; +case 10: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9]); +break; +case 11: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10]); +break; +case 12: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +); +break; +case 13: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12]); +break; +case 14: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void 
*, void *, void * +, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13]); +break; +case 15: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14]); +break; +case 16: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +); +break; +case 17: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16]); +break; +case 18: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17]); +break; +case 19: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17], args[18]); +break; +case 20: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17], args[18], args[19] +); +break; +case 21: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17], args[18], args[19] +, args[20]); +break; +case 22: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, 
args[16], args[17], args[18], args[19] +, args[20], args[21]); +break; +case 23: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17], args[18], args[19] +, args[20], args[21], args[22]); +break; +case 24: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17], args[18], args[19] +, args[20], args[21], args[22], args[23] +); +break; +case 25: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17], args[18], args[19] +, args[20], args[21], args[22], args[23] +, args[24]); +break; +case 26: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17], args[18], args[19] +, args[20], args[21], args[22], args[23] +, args[24], args[25]); +break; +case 27: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17], args[18], args[19] +, args[20], args[21], args[22], args[23] +, args[24], args[25], args[26]); +break; +case 28: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17], args[18], args[19] +, args[20], args[21], args[22], args[23] +, args[24], args[25], args[26], args[27] +); +break; +case 29: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, 
void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17], args[18], args[19] +, args[20], args[21], args[22], args[23] +, args[24], args[25], args[26], args[27] +, args[28]); +break; +case 30: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17], args[18], args[19] +, args[20], args[21], args[22], args[23] +, args[24], args[25], args[26], args[27] +, args[28], args[29]); +break; +case 31: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17], args[18], args[19] +, args[20], args[21], args[22], args[23] +, args[24], args[25], args[26], args[27] +, args[28], args[29], args[30]); +break; +case 32: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17], args[18], args[19] +, args[20], args[21], args[22], args[23] +, args[24], args[25], args[26], args[27] +, args[28], args[29], args[30], args[31] +); +break; diff --git a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp @@ -0,0 +1,44 @@ +//===- Configuration.cpp - OpenMP device configuration interface -- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the data object of the constant device environment and the +// query API. +// +//===----------------------------------------------------------------------===// + +#include "Configuration.h" +#include "State.h" +#include "Types.h" + +using namespace _OMP; + +struct DeviceEnvironmentTy { + int32_t DebugLevel; +}; + +#pragma omp declare target + +// TOOD: We want to change the name as soon as the old runtime is gone. 
+DeviceEnvironmentTy CONSTANT(omptarget_device_environment) + __attribute__((used)); + +int32_t config::getDebugLevel() { + // TODO: Implement libomptarget initialization of DeviceEnvironmentTy + return 0; +} + +uint32_t config::getNumDevices() { + // TODO: Implement libomptarget initialization of DeviceEnvironmentTy + return 1; +} + +bool config::isDebugMode(config::DebugLevel Level) { + return config::getDebugLevel() > Level; +} + +#pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Debug.cpp b/openmp/libomptarget/DeviceRTL/src/Debug.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Debug.cpp @@ -0,0 +1,31 @@ +//===--- Debug.cpp -------- Debug utilities ----------------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains debug utilities +// +//===----------------------------------------------------------------------===// + +#include "Debug.h" +#include "Configuration.h" + +using namespace _OMP; + +#pragma omp declare target + +extern "C" { +void __assert_assume(bool cond, const char *exp, const char *file, int line) { + if (!cond && config::isDebugMode(config::DebugLevel::Assertion)) { + PRINTF("ASSERTION failed: %s at %s, line %d\n", exp, file, line); + __builtin_trap(); + } + + __builtin_assume(cond); +} +} + +#pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp @@ -0,0 +1,111 @@ +//===--- Kernel.cpp - OpenMP device kernel interface -------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the kernel entry points for the device. +// +//===----------------------------------------------------------------------===// + +#include "Debug.h" +#include "Interface.h" +#include "Mapping.h" +#include "State.h" +#include "Synchronization.h" +#include "Types.h" + +using namespace _OMP; + +#pragma omp declare target + +static void inititializeRuntime(bool IsSPMD) { + // Order is important here. + synchronize::init(IsSPMD); + mapping::init(IsSPMD); + state::init(IsSPMD); +} + +/// Simple generic state machine for worker threads. +static void genericStateMachine(IdentTy *Ident) { + + uint32_t TId = mapping::getThreadIdInBlock(); + + do { + ParallelRegionFnTy WorkFn = 0; + + // Wait for the signal that we have a new work function. + synchronize::threads(); + + // Retrieve the work function from the runtime. + bool IsActive = __kmpc_kernel_parallel(&WorkFn); + + // If there is nothing more to do, break out of the state machine by + // returning to the caller. + if (!WorkFn) + return; + + if (IsActive) { + ASSERT(!mapping::isSPMDMode()); + ((void (*)(uint32_t, uint32_t))WorkFn)(0, TId); + __kmpc_kernel_end_parallel(); + } + + synchronize::threads(); + + } while (true); +} + +extern "C" { + +/// Initialization +/// +/// \param Ident Source location identification, can be NULL. 
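+/// \return -1 for all threads in SPMD mode and for the main thread in generic
+///         mode; the thread Id in the block for generic-mode worker threads
+///         that left (or skipped) the generic state machine.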
+/// +int32_t __kmpc_target_init(IdentTy *Ident, bool IsSPMD, + bool UseGenericStateMachine, bool) { + if (IsSPMD) { + inititializeRuntime(/* IsSPMD */ true); + synchronize::threads(); + } else { + inititializeRuntime(/* IsSPMD */ false); + // No need to wait since only the main threads will execute user + // code and workers will run into a barrier right away. + } + + if (IsSPMD) { + state::assumeInitialState(IsSPMD); + return -1; + } + + if (mapping::isMainThreadInGenericMode()) + return -1; + + if (UseGenericStateMachine) + genericStateMachine(Ident); + + return mapping::getThreadIdInBlock(); +} + +/// De-Initialization +/// +/// In non-SPMD, this function releases the workers trapped in a state machine +/// and also any memory dynamically allocated by the runtime. +/// +/// \param Ident Source location identification, can be NULL. +/// +void __kmpc_target_deinit(IdentTy *Ident, bool IsSPMD, bool) { + state::assumeInitialState(IsSPMD); + if (IsSPMD) + return; + + // Signal the workers to exit the state machine and exit the kernel. + state::ParallelRegionFn = nullptr; +} + +int8_t __kmpc_is_spmd_exec_mode() { return mapping::isSPMDMode(); } +} + +#pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp @@ -0,0 +1,221 @@ +//===------- Mapping.cpp - OpenMP device runtime mapping helpers -- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#include "Mapping.h" +#include "State.h" +#include "Types.h" +#include "Utils.h" + +#pragma omp declare target + +using namespace _OMP; + +namespace _OMP { +namespace impl { + +/// AMDGCN Implementation +/// +///{ +#pragma omp begin declare variant match(device = {arch(amdgcn)}) + +uint32_t getGridDim(uint32_t n, uint16_t d) { + uint32_t q = n / d; + return q + (n > q * d); +} + +uint32_t getWorkgroupDim(uint32_t group_id, uint32_t grid_size, + uint16_t group_size) { + uint32_t r = grid_size - group_id * group_size; + return (r < group_size) ? r : group_size; +} + +LaneMaskTy activemask() { return __builtin_amdgcn_read_exec(); } + +LaneMaskTy lanemaskLT() { + uint32_t Lane = mapping::getThreadIdInWarp(); + int64_t Ballot = mapping::activemask(); + uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1; + return Mask & Ballot; +} + +LaneMaskTy lanemaskGT() { + uint32_t Lane = mapping::getThreadIdInWarp(); + if (Lane == (mapping::getWarpSize() - 1)) + return 0; + int64_t Ballot = mapping::activemask(); + uint64_t Mask = (~((uint64_t)0)) << (Lane + 1); + return Mask & Ballot; +} + +uint32_t getThreadIdInWarp() { + return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u)); +} + +uint32_t getThreadIdInBlock() { return __builtin_amdgcn_workitem_id_x(); } + +uint32_t getBlockSize() { + // TODO: verify this logic for generic mode. 
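+  // The last workgroup in the grid may be partial; getWorkgroupDim (above)
+  // clamps the size to the number of remaining work-items.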
+ return getWorkgroupDim(__builtin_amdgcn_workgroup_id_x(), + __builtin_amdgcn_grid_size_x(), + __builtin_amdgcn_workgroup_size_x()); +} + +uint32_t getKernelSize() { return __builtin_amdgcn_grid_size_x(); } + +uint32_t getBlockId() { return __builtin_amdgcn_workgroup_id_x(); } + +uint32_t getNumberOfBlocks() { + return getGridDim(__builtin_amdgcn_grid_size_x(), + __builtin_amdgcn_workgroup_size_x()); +} + +uint32_t getNumberOfProcessorElements() { + // TODO + return mapping::getBlockSize(); +} + +uint32_t getWarpId() { + return mapping::getThreadIdInBlock() / mapping::getWarpSize(); +} + +uint32_t getWarpSize() { return 64; } + +uint32_t getNumberOfWarpsInBlock() { + return mapping::getBlockSize() / mapping::getWarpSize(); +} + +#pragma omp end declare variant +///} + +/// NVPTX Implementation +/// +///{ +#pragma omp begin declare variant match( \ + device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)}) + +LaneMaskTy activemask() { + unsigned int Mask; + asm("activemask.b32 %0;" : "=r"(Mask)); + return Mask; +} + +LaneMaskTy lanemaskLT() { + __kmpc_impl_lanemask_t Res; + asm("mov.u32 %0, %%lanemask_lt;" : "=r"(Res)); + return Res; +} + +LaneMaskTy lanemaskGT() { + __kmpc_impl_lanemask_t Res; + asm("mov.u32 %0, %%lanemask_gt;" : "=r"(Res)); + return Res; +} + +uint32_t getThreadIdInWarp() { + return mapping::getThreadIdInBlock() & (mapping::getWarpSize() - 1); +} + +uint32_t getThreadIdInBlock() { return __nvvm_read_ptx_sreg_tid_x(); } + +uint32_t getBlockSize() { + return __nvvm_read_ptx_sreg_ntid_x() - + (!mapping::isSPMDMode() * mapping::getWarpSize()); +} + +uint32_t getKernelSize() { return __nvvm_read_ptx_sreg_nctaid_x(); } + +uint32_t getBlockId() { return __nvvm_read_ptx_sreg_ctaid_x(); } + +uint32_t getNumberOfBlocks() { return __nvvm_read_ptx_sreg_nctaid_x(); } + +uint32_t getNumberOfProcessorElements() { + return __nvvm_read_ptx_sreg_ntid_x(); +} + +uint32_t getWarpId() { + return mapping::getThreadIdInBlock() / mapping::getWarpSize(); +} + +uint32_t getWarpSize() { return 32; } + +uint32_t getNumberOfWarpsInBlock() { + return (mapping::getBlockSize() + mapping::getWarpSize() - 1) / + mapping::getWarpSize(); +} + +#pragma omp end declare variant +///} + +} // namespace impl +} // namespace _OMP + +bool mapping::isMainThreadInGenericMode() { + if (mapping::isSPMDMode() || icv::Level) + return false; + + // Check if this is the last warp in the block. 
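As a worked example of the main-thread computation below (the numbers are illustrative, not mandated by the patch): with 1024 processor elements and the NVPTX warp size of 32, the main thread of a generic-mode kernel is lane 0 of the last warp.

// Illustrative only; mirrors the MainTId expression below for one configuration.
constexpr uint32_t WarpSize = 32;      // NVPTX value; AMDGCN uses 64
constexpr uint32_t NumElements = 1024; // assumed block size
constexpr uint32_t MainTId = (NumElements - 1) & ~(WarpSize - 1);
static_assert(MainTId == 992, "main thread is lane 0 of the last warp");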
+ uint32_t MainTId = (mapping::getNumberOfProcessorElements() - 1) & + ~(mapping::getWarpSize() - 1); + return mapping::getThreadIdInBlock() == MainTId; +} + +bool mapping::isLeaderInWarp() { + __kmpc_impl_lanemask_t Active = mapping::activemask(); + __kmpc_impl_lanemask_t LaneMaskLT = mapping::lanemaskLT(); + return utils::popc(Active & LaneMaskLT) == 0; +} + +LaneMaskTy mapping::activemask() { return impl::activemask(); } + +LaneMaskTy mapping::lanemaskLT() { return impl::lanemaskLT(); } + +LaneMaskTy mapping::lanemaskGT() { return impl::lanemaskGT(); } + +uint32_t mapping::getThreadIdInWarp() { return impl::getThreadIdInWarp(); } + +uint32_t mapping::getThreadIdInBlock() { return impl::getThreadIdInBlock(); } + +uint32_t mapping::getBlockSize() { return impl::getBlockSize(); } + +uint32_t mapping::getKernelSize() { return impl::getKernelSize(); } + +uint32_t mapping::getBlockId() { return impl::getBlockId(); } + +uint32_t mapping::getNumberOfBlocks() { return impl::getNumberOfBlocks(); } + +uint32_t mapping::getNumberOfProcessorElements() { + return impl::getNumberOfProcessorElements(); +} + +uint32_t mapping::getWarpId() { return impl::getWarpId(); } + +uint32_t mapping::getWarpSize() { return impl::getWarpSize(); } + +uint32_t mapping::getNumberOfWarpsInBlock() { + return impl::getNumberOfWarpsInBlock(); +} + +/// Execution mode +/// +///{ +static int SHARED(IsSPMDMode); + +void mapping::init(bool IsSPMD) { + if (!mapping::getThreadIdInBlock()) + IsSPMDMode = IsSPMD; +} + +bool mapping::isSPMDMode() { return IsSPMDMode; } + +bool mapping::isGenericMode() { return !isSPMDMode(); } +///} + +#pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Misc.cpp b/openmp/libomptarget/DeviceRTL/src/Misc.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Misc.cpp @@ -0,0 +1,73 @@ +//===--------- Misc.cpp - OpenMP device misc interfaces ----------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#include "Types.h" + +#pragma omp declare target + +namespace _OMP { +namespace impl { + +/// AMDGCN Implementation +/// +///{ +#pragma omp begin declare variant match(device = {arch(amdgcn)}) + +double getWTick() { return ((double)1E-9); } + +double getWTime() { + // The intrinsics for measuring time have undocumented frequency + // This will probably need to be found by measurement on a number of + // architectures. Until then, return 0, which is very inaccurate as a + // timer but resolves the undefined symbol at link time. 
+ return 0; +} + +#pragma omp end declare variant + +/// NVPTX Implementation +/// +///{ +#pragma omp begin declare variant match( \ + device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)}) + +double getWTick() { + // Timer precision is 1ns + return ((double)1E-9); +} + +double getWTime() { + unsigned long long nsecs; + asm("mov.u64 %0, %%globaltimer;" : "=l"(nsecs)); + return (double)nsecs * getWTick(); +} + +#pragma omp end declare variant + +} // namespace impl +} // namespace _OMP + +/// Interfaces +/// +///{ + +extern "C" { +int32_t __kmpc_cancellationpoint(IdentTy *, int32_t, int32_t) { return 0; } + +int32_t __kmpc_cancel(IdentTy *, int32_t, int32_t) { return 0; } + +double omp_get_wtick(void) { return _OMP::impl::getWTick(); } + +double omp_get_wtime(void) { return _OMP::impl::getWTime(); } +} + +///} +#pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp @@ -0,0 +1,198 @@ +//===---- Parallelism.cpp - OpenMP GPU parallel implementation ---- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Parallel implementation in the GPU. Here is the pattern: +// +// while (not finished) { +// +// if (master) { +// sequential code, decide which par loop to do, or if finished +// __kmpc_kernel_prepare_parallel() // exec by master only +// } +// syncthreads // A +// __kmpc_kernel_parallel() // exec by all +// if (this thread is included in the parallel) { +// switch () for all parallel loops +// __kmpc_kernel_end_parallel() // exec only by threads in parallel +// } +// +// +// The reason we don't exec end_parallel for the threads not included +// in the parallel loop is that for each barrier in the parallel +// region, these non-included threads will cycle through the +// syncthread A. Thus they must preserve their current threadId that +// is larger than thread in team. +// +// To make a long story short... +// +//===----------------------------------------------------------------------===// + +#include "Debug.h" +#include "Interface.h" +#include "Mapping.h" +#include "State.h" +#include "Synchronization.h" +#include "Types.h" +#include "Utils.h" + +using namespace _OMP; + +#pragma omp declare target + +namespace { + +uint32_t determineNumberOfThreads(int32_t NumThreadsClause) { + uint32_t NThreadsICV = + NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads; + uint32_t NumThreads = mapping::getBlockSize(); + + if (NThreadsICV != 0 && NThreadsICV < NumThreads) + NumThreads = NThreadsICV; + + // Round down to a multiple of WARPSIZE since it is legal to do so in OpenMP. + if (NumThreads < mapping::getWarpSize()) + NumThreads = 1; + else + NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1)); + + return NumThreads; +} + +// Invoke an outlined parallel function unwrapping arguments (up to 32). 
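The argument-unwrapping switch in invokeMicrotask below includes a file generated at build time; the generator itself is not shown in this excerpt. Each case is expected to look roughly like the following sketch, which casts the outlined function to the matching arity and forwards the shared-variable pointers.

// Hypothetical excerpt of generated_microtask_cases.gen for nargs == 2.
case 2:
  ((void (*)(int32_t *, int32_t *, void *, void *))fn)(&global_tid, &bound_tid,
                                                       args[0], args[1]);
  break;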
+void invokeMicrotask(int32_t global_tid, int32_t bound_tid, void *fn, + void **args, int64_t nargs) { + switch (nargs) { +#include "generated_microtask_cases.gen" + default: + PRINT("Too many arguments in kmp_invoke_microtask, aborting execution.\n"); + __builtin_trap(); + } +} + +} // namespace + +extern "C" { + +void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr, + int32_t num_threads, int proc_bind, void *fn, + void *wrapper_fn, void **args, int64_t nargs) { + + uint32_t TId = mapping::getThreadIdInBlock(); + // Handle the serialized case first, same for SPMD/non-SPMD. + if (OMP_UNLIKELY(!if_expr || icv::Level)) { + __kmpc_serialized_parallel(ident, TId); + invokeMicrotask(TId, 0, fn, args, nargs); + __kmpc_end_serialized_parallel(ident, TId); + return; + } + + uint32_t NumThreads = determineNumberOfThreads(num_threads); + if (mapping::isSPMDMode()) { + { + state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, NumThreads, + 1u, TId == 0); + state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0); + state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0); + synchronize::threads(); + + if (TId < NumThreads) + invokeMicrotask(TId, 0, fn, args, nargs); + } + synchronize::threads(); + return; + } + + // We do *not* create a new data environment because all threads in the team + // that are active are now running this parallel region. They share the + // TeamState, which has an increase level-var and potentially active-level + // set, but they do not have individual ThreadStates yet. If they ever + // modify the ICVs beyond this point a ThreadStates will be allocated. + + bool IsActiveParallelRegion = NumThreads > 1; + if (!IsActiveParallelRegion) { + state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true); + invokeMicrotask(TId, 0, fn, args, nargs); + return; + } + + void **GlobalArgs = nullptr; + if (nargs) { + __kmpc_begin_sharing_variables(&GlobalArgs, nargs); +#pragma unroll + for (int I = 0; I < nargs; I++) + GlobalArgs[I] = args[I]; + } + + { + state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, NumThreads, + 1u, true); + state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn, + (void *)nullptr, true); + state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true); + state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true); + + // Master signals work to activate workers. + synchronize::threads(); + // Master waits for workers to signal. + synchronize::threads(); + } + + if (nargs) + memory::freeShared(GlobalArgs, nargs * sizeof(void *), + "global args free shared"); +} + +__attribute__((noinline)) bool +__kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) { + // Work function and arguments for L1 parallel region. + *WorkFn = state::ParallelRegionFn; + + // If this is the termination signal from the master, quit early. + if (!*WorkFn) + return false; + + // Set to true for workers participating in the parallel region. + uint32_t TId = mapping::getThreadIdInBlock(); + bool ThreadIsActive = TId < state::ParallelTeamSize; + return ThreadIsActive; +} + +__attribute__((noinline)) void __kmpc_kernel_end_parallel() { + // In case we have modified an ICV for this thread before a ThreadState was + // created. We drop it now to not contaminate the next parallel region. 
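The ICV adjustments in __kmpc_parallel_51 above are scoped through state::ValueRAII guards. That helper is declared in State.h, which is not part of this excerpt; judging from its uses, a minimal sketch would swap in a new value on the active thread and restore the old one at scope exit.

// Sketch only; the real state::ValueRAII lives in State.h and may differ.
template <typename Ty> struct ValueRAIISketch {
  ValueRAIISketch(Ty &V, Ty NewVal, Ty OldVal, bool Active)
      : Ref(V), Old(OldVal), Active(Active) {
    if (Active)
      Ref = NewVal;
  }
  ~ValueRAIISketch() {
    if (Active)
      Ref = Old;
  }

private:
  Ty &Ref;
  Ty Old;
  bool Active;
};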
+ ASSERT(!mapping::isSPMDMode()); + uint32_t TId = mapping::getThreadIdInBlock(); + state::resetStateForThread(TId); + ASSERT(!mapping::isSPMDMode()); +} + +void __kmpc_serialized_parallel(IdentTy *, uint32_t TId) { + state::enterDataEnvironment(); + ++icv::Level; +} + +void __kmpc_end_serialized_parallel(IdentTy *, uint32_t TId) { + state::exitDataEnvironment(); + --icv::Level; +} + +uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) { return omp_get_level(); } + +int32_t __kmpc_global_thread_num(IdentTy *) { return omp_get_thread_num(); } + +void __kmpc_push_num_threads(IdentTy *, int32_t, int32_t NumThreads) { + icv::NThreads = NumThreads; +} + +void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams, + int32_t thread_limit) {} + +void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {} +} + +#pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp @@ -0,0 +1,318 @@ +//===---- Reduction.cpp - OpenMP device reduction implementation - C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of reduction with KMPC interface. +// +//===----------------------------------------------------------------------===// + +#include "Debug.h" +#include "Interface.h" +#include "Mapping.h" +#include "State.h" +#include "Synchronization.h" +#include "Types.h" +#include "Utils.h" + +using namespace _OMP; + +namespace { + +#pragma omp declare target + +void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) { + for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) { + shflFct(reduce_data, /*LaneId - not used= */ 0, + /*Offset = */ mask, /*AlgoVersion=*/0); + } +} + +void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct, + uint32_t size, uint32_t tid) { + uint32_t curr_size; + uint32_t mask; + curr_size = size; + mask = curr_size / 2; + while (mask > 0) { + shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1); + curr_size = (curr_size + 1) / 2; + mask = curr_size / 2; + } +} + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 +static uint32_t gpu_irregular_simd_reduce(void *reduce_data, + ShuffleReductFnTy shflFct) { + uint32_t size, remote_id, physical_lane_id; + physical_lane_id = mapping::getThreadIdInBlock() % mapping::getWarpSize(); + __kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT(); + __kmpc_impl_lanemask_t Liveness = mapping::activemask(); + uint32_t logical_lane_id = utils::popc(Liveness & lanemask_lt) * 2; + __kmpc_impl_lanemask_t lanemask_gt = mapping::lanemaskGT(); + do { + Liveness = mapping::activemask(); + remote_id = utils::ffs(Liveness & lanemask_gt); + size = utils::popc(Liveness); + logical_lane_id /= 2; + shflFct(reduce_data, /*LaneId =*/logical_lane_id, + /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2); + } while (logical_lane_id % 2 == 0 && size > 1); + return (logical_lane_id == 0); +} +#endif + +static int32_t nvptx_parallel_reduce_nowait(int32_t TId, int32_t num_vars, + uint64_t reduce_size, + void *reduce_data, + ShuffleReductFnTy shflFct, + InterWarpCopyFnTy cpyFct, + bool 
isSPMDExecutionMode, bool) { + uint32_t BlockThreadId = mapping::getThreadIdInBlock(); + if (mapping::isMainThreadInGenericMode()) + BlockThreadId = 0; + uint32_t NumThreads = omp_get_num_threads(); + if (NumThreads == 1) + return 1; + /* + * This reduce function handles reduction within a team. It handles + * parallel regions in both L1 and L2 parallelism levels. It also + * supports Generic, SPMD, and NoOMP modes. + * + * 1. Reduce within a warp. + * 2. Warp master copies value to warp 0 via shared memory. + * 3. Warp 0 reduces to a single value. + * 4. The reduced value is available in the thread that returns 1. + */ + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + uint32_t WarpsNeeded = + (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize(); + uint32_t WarpId = mapping::getWarpId(); + + // Volta execution model: + // For the Generic execution mode a parallel region either has 1 thread and + // beyond that, always a multiple of 32. For the SPMD execution mode we may + // have any number of threads. + if ((NumThreads % mapping::getWarpSize() == 0) || (WarpId < WarpsNeeded - 1)) + gpu_regular_warp_reduce(reduce_data, shflFct); + else if (NumThreads > 1) // Only SPMD execution mode comes thru this case. + gpu_irregular_warp_reduce(reduce_data, shflFct, + /*LaneCount=*/NumThreads % mapping::getWarpSize(), + /*LaneId=*/mapping::getThreadIdInBlock() % + mapping::getWarpSize()); + + // When we have more than [mapping::getWarpSize()] number of threads + // a block reduction is performed here. + // + // Only L1 parallel region can enter this if condition. + if (NumThreads > mapping::getWarpSize()) { + // Gather all the reduced values from each warp + // to the first warp. + cpyFct(reduce_data, WarpsNeeded); + + if (WarpId == 0) + gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, + BlockThreadId); + } + return BlockThreadId == 0; +#else + __kmpc_impl_lanemask_t Liveness = mapping::activemask(); + if (Liveness == lanes::All) // Full warp + gpu_regular_warp_reduce(reduce_data, shflFct); + else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes + gpu_irregular_warp_reduce(reduce_data, shflFct, + /*LaneCount=*/utils::popc(Liveness), + /*LaneId=*/mapping::getThreadIdInBlock() % + mapping::getWarpSize()); + else { // Dispersed lanes. Only threads in L2 + // parallel region may enter here; return + // early. + return gpu_irregular_simd_reduce(reduce_data, shflFct); + } + + // When we have more than [mapping::getWarpSize()] number of threads + // a block reduction is performed here. + // + // Only L1 parallel region can enter this if condition. + if (NumThreads > mapping::getWarpSize()) { + uint32_t WarpsNeeded = + (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize(); + // Gather all the reduced values from each warp + // to the first warp. + cpyFct(reduce_data, WarpsNeeded); + + uint32_t WarpId = BlockThreadId / mapping::getWarpSize(); + if (WarpId == 0) + gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, + BlockThreadId); + + return BlockThreadId == 0; + } + + // Get the OMP thread Id. This is different from BlockThreadId in the case of + // an L2 parallel region. + return TId == 0; +#endif // __CUDA_ARCH__ >= 700 +} + +uint32_t roundToWarpsize(uint32_t s) { + if (s < mapping::getWarpSize()) + return 1; + return (s & ~(unsigned)(mapping::getWarpSize() - 1)); +} + +uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? 
x : y; } + +static volatile uint32_t IterCnt = 0; +static volatile uint32_t Cnt = 0; + +} // namespace + +extern "C" { +int32_t __kmpc_nvptx_parallel_reduce_nowait_v2( + IdentTy *Loc, int32_t TId, int32_t num_vars, uint64_t reduce_size, + void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct) { + return nvptx_parallel_reduce_nowait(TId, num_vars, reduce_size, reduce_data, + shflFct, cpyFct, mapping::isSPMDMode(), + false); +} + +int32_t __kmpc_nvptx_teams_reduce_nowait_v2( + IdentTy *Loc, int32_t TId, void *GlobalBuffer, uint32_t num_of_records, + void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct, + ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, ListGlobalFnTy glcpyFct, + ListGlobalFnTy glredFct) { + + // Terminate all threads in non-SPMD mode except for the master thread. + uint32_t ThreadId = mapping::getThreadIdInBlock(); + if (mapping::isGenericMode()) { + if (!mapping::isMainThreadInGenericMode()) + return 0; + ThreadId = 0; + } + + // In non-generic mode all workers participate in the teams reduction. + // In generic mode only the team master participates in the teams + // reduction because the workers are waiting for parallel work. + uint32_t NumThreads = omp_get_num_threads(); + uint32_t TeamId = omp_get_team_num(); + uint32_t NumTeams = omp_get_num_teams(); + static unsigned SHARED(Bound); + static unsigned SHARED(ChunkTeamCount); + + // Block progress for teams greater than the current upper + // limit. We always only allow a number of teams less or equal + // to the number of slots in the buffer. + bool IsMaster = (ThreadId == 0); + while (IsMaster) { + Bound = atomic::read((uint32_t *)&IterCnt, __ATOMIC_SEQ_CST); + if (TeamId < Bound + num_of_records) + break; + } + + if (IsMaster) { + int ModBockId = TeamId % num_of_records; + if (TeamId < num_of_records) { + lgcpyFct(GlobalBuffer, ModBockId, reduce_data); + } else + lgredFct(GlobalBuffer, ModBockId, reduce_data); + + fence::system(__ATOMIC_SEQ_CST); + + // Increment team counter. + // This counter is incremented by all teams in the current + // BUFFER_SIZE chunk. + ChunkTeamCount = + atomic::inc((uint32_t *)&Cnt, num_of_records - 1u, __ATOMIC_SEQ_CST); + } + // Synchronize + if (mapping::isSPMDMode()) + __kmpc_barrier(Loc, TId); + + // reduce_data is global or shared so before being reduced within the + // warp we need to bring it in local memory: + // local_reduce_data = reduce_data[i] + // + // Example for 3 reduction variables a, b, c (of potentially different + // types): + // + // buffer layout (struct of arrays): + // a, a, ..., a, b, b, ... b, c, c, ... c + // |__________| + // num_of_records + // + // local_data_reduce layout (struct): + // a, b, c + // + // Each thread will have a local struct containing the values to be + // reduced: + // 1. do reduction within each warp. + // 2. do reduction across warps. + // 3. write the final result to the main reduction variable + // by returning 1 in the thread holding the reduction result. + + // Check if this is the very last team. + unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records)); + if (ChunkTeamCount == NumTeams - Bound - 1) { + // + // Last team processing. + // + if (ThreadId >= NumRecs) + return 0; + NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs)); + if (ThreadId >= NumThreads) + return 0; + + // Load from buffer and reduce. 
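As a concrete walk-through of the buffered protocol above (illustrative numbers, not from the patch): with num_of_records == 4 and four teams, Bound stays 0, every team master copies its partial result into its own slot through lgcpyFct, and the team whose atomic::inc returns ChunkTeamCount == 3 is the last to arrive. Only that team runs the final-reduction path below and returns 1; the other teams return 0 and leave their contribution in GlobalBuffer.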
+ glcpyFct(GlobalBuffer, ThreadId, reduce_data); + for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads) + glredFct(GlobalBuffer, i, reduce_data); + + // Reduce across warps to the warp master. + if (NumThreads > 1) { + gpu_regular_warp_reduce(reduce_data, shflFct); + + // When we have more than [mapping::getWarpSize()] number of threads + // a block reduction is performed here. + uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads); + if (ActiveThreads > mapping::getWarpSize()) { + uint32_t WarpsNeeded = (ActiveThreads + mapping::getWarpSize() - 1) / + mapping::getWarpSize(); + // Gather all the reduced values from each warp + // to the first warp. + cpyFct(reduce_data, WarpsNeeded); + + uint32_t WarpId = ThreadId / mapping::getWarpSize(); + if (WarpId == 0) + gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, + ThreadId); + } + } + + if (IsMaster) { + Cnt = 0; + IterCnt = 0; + return 1; + } + return 0; + } + if (IsMaster && ChunkTeamCount == num_of_records - 1) { + // Allow SIZE number of teams to proceed writing their + // intermediate results to the global buffer. + atomic::add((uint32_t *)&IterCnt, uint32_t(num_of_records), + __ATOMIC_SEQ_CST); + } + + return 0; +} + +void __kmpc_nvptx_end_reduce(int32_t TId) {} + +void __kmpc_nvptx_end_reduce_nowait(int32_t TId) {} +} + +#pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/State.cpp @@ -0,0 +1,519 @@ +//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "State.h" +#include "Configuration.h" +#include "Debug.h" +#include "Interface.h" +#include "Mapping.h" +#include "Synchronization.h" +#include "Types.h" +#include "Utils.h" + +using namespace _OMP; + +#pragma omp declare target + +/// Memory implementation +/// +///{ + +namespace { + +/// Fallback implementations are missing to trigger a link time error. +/// Implementations for new devices, including the host, should go into a +/// dedicated begin/end declare variant. +/// +///{ + +extern "C" { +void *malloc(uint64_t Size); +void free(void *Ptr); +} + +///} + +/// AMDGCN implementations of the shuffle sync idiom. +/// +///{ +#pragma omp begin declare variant match(device = {arch(amdgcn)}) + +extern "C" { +void *malloc(uint64_t Size) { + // TODO: Use some preallocated space for dynamic malloc. + return nullptr; +} + +void free(void *Ptr) {} +} + +#pragma omp end declare variant +///} + +/// Add worst-case padding so that future allocations are properly aligned. +constexpr const uint32_t Alignment = 8; + +/// A "smart" stack in shared memory. +/// +/// The stack exposes a malloc/free interface but works like a stack internally. +/// In fact, it is a separate stack *per warp*. That means, each warp must push +/// and pop symmetrically or this breaks, badly. The implementation will (aim +/// to) detect non-lock-step warps and fallback to malloc/free. The same will +/// happen if a warp runs out of memory. The master warp in generic memory is +/// special and is given more memory than the rest. 
+/// +struct SharedMemorySmartStackTy { + /// Initialize the stack. Must be called by all threads. + void init(bool IsSPMD); + + /// Allocate \p Bytes on the stack for the encountering thread. Each thread + /// can call this function. + void *push(uint64_t Bytes); + + /// Deallocate the last allocation made by the encountering thread and pointed + /// to by \p Ptr from the stack. Each thread can call this function. + void pop(void *Ptr, uint32_t Bytes); + +private: + /// Compute the size of the storage space reserved for a thread. + uint32_t computeThreadStorageTotal() { + uint32_t NumLanesInBlock = mapping::getNumberOfProcessorElements(); + return (state::SharedScratchpadSize - NumLanesInBlock + 1) / + NumLanesInBlock; + } + + /// Return the top address of the warp data stack, that is the first address + /// this warp will allocate memory at next. + void *getThreadDataTop(uint32_t TId) { + return &Data[computeThreadStorageTotal() * TId + Usage[TId]]; + } + + /// The actual storage, shared among all warps. + unsigned char Data[state::SharedScratchpadSize] + __attribute__((aligned(Alignment))); + unsigned char Usage[mapping::MaxThreadsPerTeam] + __attribute__((aligned(Alignment))); +}; + +static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256, + "Shared scratchpad of this size not supported yet."); + +/// The allocation of a single shared memory scratchpad. +static SharedMemorySmartStackTy SHARED(SharedMemorySmartStack); + +void SharedMemorySmartStackTy::init(bool IsSPMD) { + Usage[mapping::getThreadIdInBlock()] = 0; +} + +void *SharedMemorySmartStackTy::push(uint64_t Bytes) { + // First align the number of requested bytes. + uint64_t AlignedBytes = (Bytes + (Alignment - 1)) / Alignment * Alignment; + + uint32_t StorageTotal = computeThreadStorageTotal(); + + // The main thread in generic mode gets the space of its entire warp as the + // other threads do not participate in any computation at all. 
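To make the stack arithmetic above concrete (illustrative request size, not from the patch): a 13-byte __kmpc_alloc_shared request is first rounded up to 16 bytes, and in generic mode the main thread's per-thread slice is additionally multiplied by the warp size, which is what the check below implements.

// Illustrative only; mirrors the rounding used by push() and pop().
constexpr uint64_t Align = 8;      // matches the Alignment constant above
constexpr uint64_t Requested = 13; // assumed request size
constexpr uint64_t Rounded = (Requested + (Align - 1)) / Align * Align;
static_assert(Rounded == 16, "requests are rounded up to the alignment");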
+ if (mapping::isMainThreadInGenericMode()) + StorageTotal *= mapping::getWarpSize(); + + int TId = mapping::getThreadIdInBlock(); + if (Usage[TId] + AlignedBytes <= StorageTotal) { + void *Ptr = getThreadDataTop(TId); + Usage[TId] += AlignedBytes; + return Ptr; + } + + return memory::allocGlobal(AlignedBytes, + "Slow path shared memory allocation, insufficient " + "shared memory stack memory!"); +} + +void SharedMemorySmartStackTy::pop(void *Ptr, uint32_t Bytes) { + uint64_t AlignedBytes = (Bytes + (Alignment - 1)) / Alignment * Alignment; + if (Ptr >= &Data[0] && Ptr < &Data[state::SharedScratchpadSize]) { + int TId = mapping::getThreadIdInBlock(); + Usage[TId] -= AlignedBytes; + return; + } + memory::freeGlobal(Ptr, "Slow path shared memory deallocation"); +} + +} // namespace + +void *memory::allocShared(uint64_t Bytes, const char *Reason) { + return SharedMemorySmartStack.push(Bytes); +} + +void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) { + SharedMemorySmartStack.pop(Ptr, Bytes); +} + +void *memory::allocGlobal(uint64_t Bytes, const char *Reason) { + return malloc(Bytes); +} + +void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); } + +///} + +namespace { + +struct ICVStateTy { + uint32_t NThreadsVar; + uint32_t LevelVar; + uint32_t ActiveLevelVar; + uint32_t MaxActiveLevelsVar; + uint32_t RunSchedVar; + uint32_t RunSchedChunkVar; + + bool operator==(const ICVStateTy &Other) const; + + void assertEqual(const ICVStateTy &Other) const; +}; + +bool ICVStateTy::operator==(const ICVStateTy &Other) const { + return (NThreadsVar == Other.NThreadsVar) & (LevelVar == Other.LevelVar) & + (ActiveLevelVar == Other.ActiveLevelVar) & + (MaxActiveLevelsVar == Other.MaxActiveLevelsVar) & + (RunSchedVar == Other.RunSchedVar) & + (RunSchedChunkVar == Other.RunSchedChunkVar); +} + +void ICVStateTy::assertEqual(const ICVStateTy &Other) const { + ASSERT(NThreadsVar == Other.NThreadsVar); + ASSERT(LevelVar == Other.LevelVar); + ASSERT(ActiveLevelVar == Other.ActiveLevelVar); + ASSERT(MaxActiveLevelsVar == Other.MaxActiveLevelsVar); + ASSERT(RunSchedVar == Other.RunSchedVar); + ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar); +} + +struct TeamStateTy { + /// TODO: provide a proper init function. + void init(bool IsSPMD); + + bool operator==(const TeamStateTy &) const; + + void assertEqual(TeamStateTy &Other) const; + + /// ICVs + /// + /// Preallocated storage for ICV values that are used if the threads have not + /// set a custom default. The latter is supported but unlikely and slow(er). + /// + ///{ + ICVStateTy ICVState; + ///} + + uint32_t ParallelTeamSize; + ParallelRegionFnTy ParallelRegionFnVar; +}; + +TeamStateTy SHARED(TeamState); + +void TeamStateTy::init(bool IsSPMD) { + ICVState.NThreadsVar = mapping::getBlockSize(); + ICVState.LevelVar = 0; + ICVState.ActiveLevelVar = 0; + ICVState.MaxActiveLevelsVar = 1; + ICVState.RunSchedVar = omp_sched_static; + ICVState.RunSchedChunkVar = 1; + ParallelTeamSize = 1; + ParallelRegionFnVar = nullptr; +} + +bool TeamStateTy::operator==(const TeamStateTy &Other) const { + return (ICVState == Other.ICVState) & + (ParallelTeamSize == Other.ParallelTeamSize); +} + +void TeamStateTy::assertEqual(TeamStateTy &Other) const { + ICVState.assertEqual(Other.ICVState); + ASSERT(ParallelTeamSize == Other.ParallelTeamSize); +} + +struct ThreadStateTy { + + /// ICVs have preallocated storage in the TeamStateTy which is used if a + /// thread has not set a custom value. The latter is supported but unlikely. 
+ /// When it happens we will allocate dynamic memory to hold the values of all + /// ICVs. Thus, the first time an ICV is set by a thread we will allocate an + /// ICV struct to hold them all. This is slower than alternatives but allows + /// users to pay only for what they use. + /// + ICVStateTy ICVState; + + ThreadStateTy *PreviousThreadState; + + void init() { + ICVState = TeamState.ICVState; + PreviousThreadState = nullptr; + } + + void init(ThreadStateTy &PreviousTS) { + ICVState = PreviousTS.ICVState; + PreviousThreadState = &PreviousTS; + } +}; + +__attribute__((loader_uninitialized)) +ThreadStateTy *ThreadStates[mapping::MaxThreadsPerTeam]; +#pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc) + +uint32_t &lookupForModify32Impl(uint32_t ICVStateTy::*Var) { + if (OMP_LIKELY(TeamState.ICVState.LevelVar == 0)) + return TeamState.ICVState.*Var; + uint32_t TId = mapping::getThreadIdInBlock(); + if (!ThreadStates[TId]) { + ThreadStates[TId] = reinterpret_cast(memory::allocGlobal( + sizeof(ThreadStateTy), "ICV modification outside data environment")); + ThreadStates[TId]->init(); + } + return ThreadStates[TId]->ICVState.*Var; +} + +uint32_t &lookup32Impl(uint32_t ICVStateTy::*Var) { + uint32_t TId = mapping::getThreadIdInBlock(); + if (OMP_UNLIKELY(ThreadStates[TId])) + return ThreadStates[TId]->ICVState.*Var; + return TeamState.ICVState.*Var; +} +uint64_t &lookup64Impl(uint64_t ICVStateTy::*Var) { + uint64_t TId = mapping::getThreadIdInBlock(); + if (OMP_UNLIKELY(ThreadStates[TId])) + return ThreadStates[TId]->ICVState.*Var; + return TeamState.ICVState.*Var; +} + +int returnValIfLevelIsActive(int Level, int Val, int DefaultVal, + int OutOfBoundsVal = -1) { + if (Level == 0) + return DefaultVal; + int LevelVar = omp_get_level(); + if (OMP_UNLIKELY(Level < 0 || Level > LevelVar)) + return OutOfBoundsVal; + int ActiveLevel = icv::ActiveLevel; + if (OMP_UNLIKELY(Level != ActiveLevel)) + return DefaultVal; + return Val; +} + +} // namespace + +uint32_t &state::lookup32(ValueKind Kind, bool IsReadonly) { + switch (Kind) { + case state::VK_NThreads: + if (IsReadonly) + return lookup32Impl(&ICVStateTy::NThreadsVar); + return lookupForModify32Impl(&ICVStateTy::NThreadsVar); + case state::VK_Level: + if (IsReadonly) + return lookup32Impl(&ICVStateTy::LevelVar); + return lookupForModify32Impl(&ICVStateTy::LevelVar); + case state::VK_ActiveLevel: + if (IsReadonly) + return lookup32Impl(&ICVStateTy::ActiveLevelVar); + return lookupForModify32Impl(&ICVStateTy::ActiveLevelVar); + case state::VK_MaxActiveLevels: + if (IsReadonly) + return lookup32Impl(&ICVStateTy::MaxActiveLevelsVar); + return lookupForModify32Impl(&ICVStateTy::MaxActiveLevelsVar); + case state::VK_RunSched: + if (IsReadonly) + return lookup32Impl(&ICVStateTy::RunSchedVar); + return lookupForModify32Impl(&ICVStateTy::RunSchedVar); + case state::VK_RunSchedChunk: + if (IsReadonly) + return lookup32Impl(&ICVStateTy::RunSchedChunkVar); + return lookupForModify32Impl(&ICVStateTy::RunSchedChunkVar); + case state::VK_ParallelTeamSize: + return TeamState.ParallelTeamSize; + default: + break; + } + __builtin_unreachable(); +} + +void *&state::lookupPtr(ValueKind Kind, bool IsReadonly) { + switch (Kind) { + case state::VK_ParallelRegionFn: + return TeamState.ParallelRegionFnVar; + default: + break; + } + __builtin_unreachable(); +} + +void state::init(bool IsSPMD) { + SharedMemorySmartStack.init(IsSPMD); + if (!mapping::getThreadIdInBlock()) + TeamState.init(IsSPMD); + + ThreadStates[mapping::getThreadIdInBlock()] = nullptr; +} + 
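The icv::NThreads, icv::Level, and similar handles used throughout these files are declared in State.h, which is not part of this excerpt. Their behaviour in this patch (plain reads go through the read-only path, assignments and increments go through the modifying path that may lazily allocate a ThreadStateTy) suggests a thin proxy over state::lookup32, sketched here under that assumption.

// Sketch only; the real definition lives in State.h and may differ.
struct ValueProxySketch {
  state::ValueKind Kind;
  operator uint32_t() const { return state::lookup32(Kind, /*IsReadonly=*/true); }
  ValueProxySketch &operator=(uint32_t V) {
    state::lookup32(Kind, /*IsReadonly=*/false) = V;
    return *this;
  }
  ValueProxySketch &operator++() { return *this = uint32_t(*this) + 1; }
  ValueProxySketch &operator--() { return *this = uint32_t(*this) - 1; }
};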
+void state::enterDataEnvironment() { + unsigned TId = mapping::getThreadIdInBlock(); + ThreadStateTy *NewThreadState = + static_cast(__kmpc_alloc_shared(sizeof(ThreadStateTy))); + NewThreadState->init(*ThreadStates[TId]); + ThreadStates[TId] = NewThreadState; +} + +void state::exitDataEnvironment() { + unsigned TId = mapping::getThreadIdInBlock(); + resetStateForThread(TId); +} + +void state::resetStateForThread(uint32_t TId) { + if (OMP_LIKELY(!ThreadStates[TId])) + return; + + ThreadStateTy *PreviousThreadState = ThreadStates[TId]->PreviousThreadState; + __kmpc_free_shared(ThreadStates[TId], sizeof(ThreadStateTy)); + ThreadStates[TId] = PreviousThreadState; +} + +void state::runAndCheckState(void(Func(void))) { + TeamStateTy OldTeamState = TeamState; + OldTeamState.assertEqual(TeamState); + + Func(); + + OldTeamState.assertEqual(TeamState); +} + +void state::assumeInitialState(bool IsSPMD) { + TeamStateTy InitialTeamState; + InitialTeamState.init(IsSPMD); + InitialTeamState.assertEqual(TeamState); + ASSERT(!ThreadStates[mapping::getThreadIdInBlock()]); + ASSERT(mapping::isSPMDMode() == IsSPMD); +} + +extern "C" { +void omp_set_dynamic(int V) {} + +int omp_get_dynamic(void) { return 0; } + +void omp_set_num_threads(int V) { icv::NThreads = V; } + +int omp_get_max_threads(void) { return icv::NThreads; } + +int omp_get_level(void) { + int LevelVar = icv::Level; + ASSERT(LevelVar >= 0); + return LevelVar; +} + +int omp_get_active_level(void) { return !!icv::ActiveLevel; } + +int omp_in_parallel(void) { return !!icv::ActiveLevel; } + +void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) { + *ScheduleKind = static_cast((int)icv::RunSched); + *ChunkSize = state::RunSchedChunk; +} + +void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) { + icv::RunSched = (int)ScheduleKind; + state::RunSchedChunk = ChunkSize; +} + +int omp_get_ancestor_thread_num(int Level) { + return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0); +} + +int omp_get_thread_num(void) { + return omp_get_ancestor_thread_num(omp_get_level()); +} + +int omp_get_team_size(int Level) { + return returnValIfLevelIsActive(Level, state::ParallelTeamSize, 1); +} + +int omp_get_num_threads(void) { return state::ParallelTeamSize; } + +int omp_get_thread_limit(void) { return mapping::getKernelSize(); } + +int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); } + +void omp_set_nested(int) {} + +int omp_get_nested(void) { return false; } + +void omp_set_max_active_levels(int Levels) { + icv::MaxActiveLevels = Levels > 0 ? 
1 : 0; +} + +int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; } + +omp_proc_bind_t omp_get_proc_bind(void) { return omp_proc_bind_false; } + +int omp_get_num_places(void) { return 0; } + +int omp_get_place_num_procs(int) { return omp_get_num_procs(); } + +void omp_get_place_proc_ids(int, int *) { + // TODO +} + +int omp_get_place_num(void) { return 0; } + +int omp_get_partition_num_places(void) { return 0; } + +void omp_get_partition_place_nums(int *) { + // TODO +} + +int omp_get_cancellation(void) { return 0; } + +void omp_set_default_device(int) {} + +int omp_get_default_device(void) { return -1; } + +int omp_get_num_devices(void) { return config::getNumDevices(); } + +int omp_get_num_teams(void) { return mapping::getNumberOfBlocks(); } + +int omp_get_team_num() { return mapping::getBlockId(); } + +int omp_get_initial_device(void) { return -1; } +} + +extern "C" { +__attribute__((noinline)) void *__kmpc_alloc_shared(uint64_t Bytes) { + return memory::allocShared(Bytes, "Frontend alloc shared"); +} + +__attribute__((noinline)) void __kmpc_free_shared(void *Ptr, uint64_t Bytes) { + memory::freeShared(Ptr, Bytes, "Frontend free shared"); +} + +[[clang::loader_uninitialized]] static void **SharedMemVariableSharingSpacePtr; +#pragma omp allocate(SharedMemVariableSharingSpacePtr) \ + allocator(omp_pteam_mem_alloc) + +void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t NumArgs) { + SharedMemVariableSharingSpacePtr = + (void **)__kmpc_alloc_shared(sizeof(void *) * NumArgs); + *GlobalArgs = SharedMemVariableSharingSpacePtr; +} + +void __kmpc_end_sharing_variables(void **GlobalArgsPtr, uint64_t NumArgs) { + __kmpc_free_shared(SharedMemVariableSharingSpacePtr, + sizeof(void *) * NumArgs); +} + +void __kmpc_get_shared_variables(void ***GlobalArgs) { + *GlobalArgs = SharedMemVariableSharingSpacePtr; +} +} +#pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp @@ -0,0 +1,314 @@ +//===- Synchronization.cpp - OpenMP Device synchronization API ---- c++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Include all synchronization. +// +//===----------------------------------------------------------------------===// + +#include "Synchronization.h" + +#include "Debug.h" +#include "Interface.h" +#include "Mapping.h" +#include "State.h" +#include "Types.h" +#include "Utils.h" + +#pragma omp declare target + +using namespace _OMP; + +namespace impl { + +/// Atomics +/// +///{ +/// NOTE: This function needs to be implemented by every target. 
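The reason atomicInc is singled out is that it has CUDA-style wrap-around semantics rather than plain fetch-add behaviour. For intuition only, a non-atomic reference version would read as follows; the target-specific implementations below map this onto hardware builtins.

// Reference semantics of atomicInc, for illustration only (not atomic and not
// part of the runtime): return the old value, increment it, and wrap to zero
// once the old value has reached Val.
static uint32_t atomicIncReference(uint32_t *Address, uint32_t Val) {
  uint32_t Old = *Address;
  *Address = (Old >= Val) ? 0 : Old + 1;
  return Old;
}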
+uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering);
+
+uint32_t atomicRead(uint32_t *Address, int Ordering) {
+  return __atomic_fetch_add(Address, 0U, __ATOMIC_SEQ_CST);
+}
+
+uint32_t atomicAdd(uint32_t *Address, uint32_t Val, int Ordering) {
+  return __atomic_fetch_add(Address, Val, Ordering);
+}
+uint32_t atomicMax(uint32_t *Address, uint32_t Val, int Ordering) {
+  return __atomic_fetch_max(Address, Val, Ordering);
+}
+
+uint32_t atomicExchange(uint32_t *Address, uint32_t Val, int Ordering) {
+  uint32_t R;
+  __atomic_exchange(Address, &Val, &R, Ordering);
+  return R;
+}
+uint32_t atomicCAS(uint32_t *Address, uint32_t Compare, uint32_t Val,
+                   int Ordering) {
+  (void)__atomic_compare_exchange(Address, &Compare, &Val, false, Ordering,
+                                  Ordering);
+  return Compare;
+}
+
+uint64_t atomicAdd(uint64_t *Address, uint64_t Val, int Ordering) {
+  return __atomic_fetch_add(Address, Val, Ordering);
+}
+///}
+
+/// AMDGCN Implementation
+///
+///{
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering) {
+  return __builtin_amdgcn_atomic_inc32(Address, Val, Ordering, "");
+}
+
+uint32_t SHARED(namedBarrierTracker);
+
+void namedBarrierInit() {
+  // Don't have global ctors, and shared memory is not zero init
+  atomic::store(&namedBarrierTracker, 0u, __ATOMIC_RELEASE);
+}
+
+void namedBarrier() {
+  uint32_t NumThreads = omp_get_num_threads();
+  // assert(NumThreads % 32 == 0);
+
+  uint32_t WarpSize = mapping::getWarpSize();
+  uint32_t NumWaves = NumThreads / WarpSize;
+
+  fence::team(__ATOMIC_ACQUIRE);
+
+  // named barrier implementation for amdgcn.
+  // Uses two 16 bit unsigned counters. One for the number of waves to have
+  // reached the barrier, and one to count how many times the barrier has been
+  // passed. These are packed in a single atomically accessed 32 bit integer.
+  // Low bits for the number of waves, assumed zero before this call.
+  // High bits to count the number of times the barrier has been passed.
+
+  // precondition: NumWaves != 0;
+  // invariant: NumWaves * WarpSize == NumThreads;
+  // precondition: NumWaves < 0xffffu;
+
+  // Increment the low 16 bits once, using the lowest active thread.
+  if (mapping::isLeaderInWarp()) {
+    uint32_t load = atomic::add(&namedBarrierTracker, 1,
+                                __ATOMIC_RELAXED); // commutative
+
+    // Record the number of times the barrier has been passed
+    uint32_t generation = load & 0xffff0000u;
+
+    if ((load & 0x0000ffffu) == (NumWaves - 1)) {
+      // Reached NumWaves in low bits so this is the last wave.
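Worked example for the packed counter (illustrative, eight waves assumed): generation 0 starts the tracker at 0x00000000; the first seven leader lanes raise it to 0x00000007; the eighth leader's fetch-add returns 7, which equals NumWaves - 1, so it stores 0x00010000, clearing the wave count and advancing the generation bits that the remaining waves spin on below.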
+      // Set low bits to zero and increment high bits
+      load += 0x00010000u; // wrap is safe
+      load &= 0xffff0000u; // because bits zeroed second
+
+      // Reset the wave counter and release the waiting waves
+      atomic::store(&namedBarrierTracker, load, __ATOMIC_RELAXED);
+    } else {
+      // more waves still to go, spin until generation counter changes
+      do {
+        __builtin_amdgcn_s_sleep(0);
+        load = atomic::load(&namedBarrierTracker, __ATOMIC_RELAXED);
+      } while ((load & 0xffff0000u) == generation);
+    }
+  }
+  fence::team(__ATOMIC_RELEASE);
+}
+
+void syncWarp(__kmpc_impl_lanemask_t) {
+  // AMDGCN doesn't need to sync threads in a warp
+}
+
+void syncThreads() { __builtin_amdgcn_s_barrier(); }
+
+void fenceTeam(int Ordering) { __builtin_amdgcn_fence(Ordering, "workgroup"); }
+
+void fenceKernel(int Ordering) { __builtin_amdgcn_fence(Ordering, "agent"); }
+
+void fenceSystem(int Ordering) { __builtin_amdgcn_fence(Ordering, ""); }
+
+#pragma omp end declare variant
+///}
+
+/// NVPTX Implementation
+///
+///{
+#pragma omp begin declare variant match(                                      \
+    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
+
+uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering) {
+  return __nvvm_atom_inc_gen_ui(Address, Val);
+}
+
+void namedBarrierInit() {}
+
+void namedBarrier() {
+  uint32_t NumThreads = omp_get_num_threads();
+  ASSERT(NumThreads % 32 == 0);
+
+  // The named barrier for active parallel threads of a team in an L1 parallel
+  // region to synchronize with each other.
+  constexpr int BarrierNo = 7;
+  asm volatile("barrier.sync %0, %1;"
+               :
+               : "r"(BarrierNo), "r"(NumThreads)
+               : "memory");
+}
+
+void fenceTeam(int) { __nvvm_membar_cta(); }
+
+void fenceKernel(int) { __nvvm_membar_gl(); }
+
+void fenceSystem(int) { __nvvm_membar_sys(); }
+
+void syncWarp(__kmpc_impl_lanemask_t Mask) { __nvvm_bar_warp_sync(Mask); }
+
+void syncThreads() {
+  constexpr int BarrierNo = 8;
+  asm volatile("barrier.sync %0;" : : "r"(BarrierNo) : "memory");
+}
+
+constexpr uint32_t OMP_SPIN = 1000;
+constexpr uint32_t UNSET = 0;
+constexpr uint32_t SET = 1;
+
+// TODO: This seems to hide a bug in the declare variant handling. If it is
+// called before it is defined here the overload won't happen. Investigate
+// later!
+void unsetLock(omp_lock_t *Lock) {
+  (void)atomicExchange((uint32_t *)Lock, UNSET, __ATOMIC_SEQ_CST);
+}
+
+int testLock(omp_lock_t *Lock) {
+  return atomicAdd((uint32_t *)Lock, 0u, __ATOMIC_SEQ_CST);
+}
+
+void initLock(omp_lock_t *Lock) { unsetLock(Lock); }
+
+void destoryLock(omp_lock_t *Lock) { unsetLock(Lock); }
+
+void setLock(omp_lock_t *Lock) {
+  // TODO: not sure spinning is a good idea here..
+  while (atomicCAS((uint32_t *)Lock, UNSET, SET, __ATOMIC_SEQ_CST) != UNSET) {
+    int32_t start = __nvvm_read_ptx_sreg_clock();
+    int32_t now;
+    for (;;) {
+      now = __nvvm_read_ptx_sreg_clock();
+      int32_t cycles = now > start ?
now - start : now + (0xffffffff - start); + if (cycles >= OMP_SPIN * mapping::getBlockId()) { + break; + } + } + } // wait for 0 to be the read value +} + +#pragma omp end declare variant +///} + +} // namespace impl + +void synchronize::init(bool IsSPMD) { + if (!IsSPMD) + impl::namedBarrierInit(); +} + +void synchronize::warp(LaneMaskTy Mask) { impl::syncWarp(Mask); } + +void synchronize::threads() { impl::syncThreads(); } + +void fence::team(int Ordering) { impl::fenceTeam(Ordering); } + +void fence::kernel(int Ordering) { impl::fenceKernel(Ordering); } + +void fence::system(int Ordering) { impl::fenceSystem(Ordering); } + +uint32_t atomic::read(uint32_t *Addr, int Ordering) { + return impl::atomicRead(Addr, Ordering); +} + +uint32_t atomic::inc(uint32_t *Addr, uint32_t V, int Ordering) { + return impl::atomicInc(Addr, V, Ordering); +} + +uint32_t atomic::add(uint32_t *Addr, uint32_t V, int Ordering) { + return impl::atomicAdd(Addr, V, Ordering); +} + +uint64_t atomic::add(uint64_t *Addr, uint64_t V, int Ordering) { + return impl::atomicAdd(Addr, V, Ordering); +} + +extern "C" { +void __kmpc_ordered(IdentTy *Loc, int32_t TId) {} + +void __kmpc_end_ordered(IdentTy *Loc, int32_t TId) {} + +int32_t __kmpc_cancel_barrier(IdentTy *Loc, int32_t TId) { + __kmpc_barrier(Loc, TId); + return 0; +} + +void __kmpc_barrier(IdentTy *Loc, int32_t TId) { + if (mapping::isMainThreadInGenericMode()) + return __kmpc_flush(Loc); + + if (mapping::isSPMDMode()) + return __kmpc_barrier_simple_spmd(Loc, TId); + + impl::namedBarrier(); +} + +__attribute__((noinline)) void __kmpc_barrier_simple_spmd(IdentTy *Loc, + int32_t TId) { + synchronize::threads(); +} + +int32_t __kmpc_master(IdentTy *Loc, int32_t TId) { + return omp_get_team_num() == 0; +} + +void __kmpc_end_master(IdentTy *Loc, int32_t TId) {} + +int32_t __kmpc_single(IdentTy *Loc, int32_t TId) { + return __kmpc_master(Loc, TId); +} + +void __kmpc_end_single(IdentTy *Loc, int32_t TId) { + // The barrier is explicitly called. +} + +void __kmpc_flush(IdentTy *Loc) { fence::kernel(__ATOMIC_SEQ_CST); } + +__kmpc_impl_lanemask_t __kmpc_warp_active_thread_mask() { + return mapping::activemask(); +} + +void __kmpc_syncwarp(__kmpc_impl_lanemask_t Mask) { synchronize::warp(Mask); } + +void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) { + omp_set_lock(reinterpret_cast(Name)); +} + +void __kmpc_end_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) { + omp_unset_lock(reinterpret_cast(Name)); +} + +void omp_init_lock(omp_lock_t *Lock) { impl::initLock(Lock); } + +void omp_destroy_lock(omp_lock_t *Lock) { impl::destoryLock(Lock); } + +void omp_set_lock(omp_lock_t *Lock) { impl::setLock(Lock); } + +void omp_unset_lock(omp_lock_t *Lock) { impl::unsetLock(Lock); } + +int omp_test_lock(omp_lock_t *Lock) { return impl::testLock(Lock); } +} // extern "C" + +#pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Tasking.cpp b/openmp/libomptarget/DeviceRTL/src/Tasking.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Tasking.cpp @@ -0,0 +1,104 @@ +//===-------- Tasking.cpp - NVPTX OpenMP tasks support ------------ C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Task implementation support. 
+// +// TODO: We should not allocate and execute the task in two steps. A new API is +// needed for that though. +// +//===----------------------------------------------------------------------===// + +#include "Interface.h" +#include "State.h" +#include "Types.h" +#include "Utils.h" + +using namespace _OMP; + +#pragma omp declare target + +TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, uint32_t, int32_t, + uint64_t TaskSizeInclPrivateValues, + uint64_t SharedValuesSize, + TaskFnTy TaskFn) { + auto TaskSizeInclPrivateValuesPadded = + utils::roundUp(TaskSizeInclPrivateValues, uint64_t(sizeof(void *))); + auto TaskSizeTotal = TaskSizeInclPrivateValuesPadded + SharedValuesSize; + TaskDescriptorTy *TaskDescriptor = (TaskDescriptorTy *)memory::allocGlobal( + TaskSizeTotal, "explicit task descriptor"); + TaskDescriptor->Payload = + utils::advance(TaskDescriptor, TaskSizeInclPrivateValuesPadded); + TaskDescriptor->TaskFn = TaskFn; + + return TaskDescriptor; +} + +int32_t __kmpc_omp_task(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor) { + return __kmpc_omp_task_with_deps(Loc, TId, TaskDescriptor, 0, 0, 0, 0); +} + +int32_t __kmpc_omp_task_with_deps(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor, int32_t, + void *, int32_t, void *) { + state::DateEnvironmentRAII DERAII; + + TaskDescriptor->TaskFn(0, TaskDescriptor); + + memory::freeGlobal(TaskDescriptor, "explicit task descriptor"); + return 0; +} + +void __kmpc_omp_task_begin_if0(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor) { + state::enterDataEnvironment(); +} + +void __kmpc_omp_task_complete_if0(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor) { + state::exitDataEnvironment(); + + memory::freeGlobal(TaskDescriptor, "explicit task descriptor"); +} + +void __kmpc_omp_wait_deps(IdentTy *Loc, uint32_t TId, int32_t, void *, int32_t, + void *) {} + +void __kmpc_taskgroup(IdentTy *Loc, uint32_t TId) {} + +void __kmpc_end_taskgroup(IdentTy *Loc, uint32_t TId) {} + +int32_t __kmpc_omp_taskyield(IdentTy *Loc, uint32_t TId, int) { return 0; } + +int32_t __kmpc_omp_taskwait(IdentTy *Loc, uint32_t TId) { return 0; } + +void __kmpc_taskloop(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor, int, + uint64_t *LowerBound, uint64_t *UpperBound, int64_t, int, + int32_t, uint64_t, void *) { + // Skip task entirely if empty iteration space. + if (*LowerBound > *UpperBound) + return; + + // The compiler has already stored lb and ub in the TaskDescriptorTy structure + // as we are using a single task to execute the entire loop, we can leave + // the initial task_t untouched + __kmpc_omp_task_with_deps(Loc, TId, TaskDescriptor, 0, 0, 0, 0); +} + +int omp_in_final(void) { + // treat all tasks as final... Specs may expect runtime to keep + // track more precisely if a task was actively set by users... This + // is not explicitly specified; will treat as if runtime can + // actively decide to put a non-final task into a final one. + return 1; +} + +int omp_get_max_task_priority(void) { return 0; } + +#pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Utils.cpp b/openmp/libomptarget/DeviceRTL/src/Utils.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Utils.cpp @@ -0,0 +1,141 @@ +//===------- Utils.cpp - OpenMP device runtime utility functions -- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "Utils.h"
+
+#include "Interface.h"
+#include "Mapping.h"
+
+#pragma omp declare target
+
+using namespace _OMP;
+
+namespace _OMP {
+/// Helper to keep code alive without introducing a performance penalty.
+__attribute__((used, weak, optnone)) void keepAlive() {
+  __kmpc_barrier_simple_spmd(nullptr, 0);
+}
+} // namespace _OMP
+
+namespace impl {
+
+/// AMDGCN Implementation
+///
+///{
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
+  *LowBits = (uint32_t)(Val & UINT64_C(0x00000000FFFFFFFF));
+  *HighBits = (uint32_t)((Val & UINT64_C(0xFFFFFFFF00000000)) >> 32);
+}
+
+uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
+  return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits;
+}
+
+#pragma omp end declare variant
+
+/// NVPTX Implementation
+///
+///{
+#pragma omp begin declare variant match(                                      \
+    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
+
+void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
+  uint32_t LowBitsLocal, HighBitsLocal;
+  asm("mov.b64 {%0,%1}, %2;"
+      : "=r"(LowBitsLocal), "=r"(HighBitsLocal)
+      : "l"(Val));
+  *LowBits = LowBitsLocal;
+  *HighBits = HighBitsLocal;
+}
+
+uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
+  uint64_t Val;
+  asm("mov.b64 %0, {%1,%2};" : "=l"(Val) : "r"(LowBits), "r"(HighBits));
+  return Val;
+}
+
+#pragma omp end declare variant
+
+/// AMDGCN Implementation
+///
+///{
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) {
+  int Width = mapping::getWarpSize();
+  int Self = mapping::getThreadIdInWarp();
+  int Index = SrcLane + (Self & ~(Width - 1));
+  return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
+}
+
+int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
+                    int32_t Width) {
+  int Self = mapping::getThreadIdInWarp();
+  int Index = Self + LaneDelta;
+  Index = (int)(LaneDelta + (Self & (Width - 1))) >= Width ?
Self : Index; + return __builtin_amdgcn_ds_bpermute(Index << 2, Var); +} + +#pragma omp end declare variant +///} + +/// NVPTX Implementation +/// +///{ +#pragma omp begin declare variant match( \ + device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)}) + +int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) { + return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f); +} + +int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) { + int32_t T = ((mapping::getWarpSize() - Width) << 8) | 0x1f; + return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T); +} + +#pragma omp end declare variant +} // namespace impl + +uint64_t utils::pack(uint32_t LowBits, uint32_t HighBits) { + return impl::Pack(LowBits, HighBits); +} + +void utils::unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits) { + impl::Unpack(Val, &LowBits, &HighBits); +} + +int32_t utils::shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) { + return impl::shuffle(Mask, Var, SrcLane); +} + +int32_t utils::shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, + int32_t Width) { + return impl::shuffleDown(Mask, Var, Delta, Width); +} + +extern "C" { +int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Delta, int16_t SrcLane) { + return impl::shuffleDown(lanes::All, Val, Delta, SrcLane); +} + +int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta, int16_t Width) { + uint32_t lo, hi; + utils::unpack(Val, lo, hi); + hi = impl::shuffleDown(lanes::All, hi, Delta, Width); + lo = impl::shuffleDown(lanes::All, lo, Delta, Width); + return utils::pack(lo, hi); +} +} + +#pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp @@ -0,0 +1,598 @@ +//===----- Workshare.cpp - OpenMP workshare implementation ------ C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of the KMPC interface +// for the loop construct plus other worksharing constructs that use the same +// interface as loops. +// +//===----------------------------------------------------------------------===// + +#include "Debug.h" +#include "Interface.h" +#include "Mapping.h" +#include "State.h" +#include "Synchronization.h" +#include "Types.h" +#include "Utils.h" + +using namespace _OMP; + +// TODO: +struct DynamicScheduleTracker { + int64_t Chunk; + int64_t LoopUpperBound; + int64_t NextLowerBound; + int64_t Stride; + kmp_sched_t ScheduleType; + DynamicScheduleTracker *NextDST; +}; + +#define ASSERT0(...) + +// used by the library for the interface with the app +#define DISPATCH_FINISHED 0 +#define DISPATCH_NOTFINISHED 1 + +// used by dynamic scheduling +#define FINISHED 0 +#define NOT_FINISHED 1 +#define LAST_CHUNK 2 + +#pragma omp declare target + +// TODO: This variable is a hack inherited from the old runtime. +uint64_t SHARED(Cnt); + +template struct omptarget_nvptx_LoopSupport { + //////////////////////////////////////////////////////////////////////////////// + // Loop with static scheduling with chunk + + // Generic implementation of OMP loop scheduling with static policy + /*! 
+#define ASSERT0(...)
+
+// used by the library for the interface with the app
+#define DISPATCH_FINISHED 0
+#define DISPATCH_NOTFINISHED 1
+
+// used by dynamic scheduling
+#define FINISHED 0
+#define NOT_FINISHED 1
+#define LAST_CHUNK 2
+
+#pragma omp declare target
+
+// TODO: This variable is a hack inherited from the old runtime.
+uint64_t SHARED(Cnt);
+
+template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
+  //////////////////////////////////////////////////////////////////////////////
+  // Loop with static scheduling with chunk
+
+  // Generic implementation of OMP loop scheduling with static policy
+  /*! \brief Calculate the initial bounds and stride for a statically
+   *  scheduled loop.
+   *  @param[in] loc location in code of the call (not used here)
+   *  @param[in] global_tid global thread id
+   *  @param[in] schedtype type of scheduling (see omptarget-nvptx.h)
+   *  @param[in] plastiter pointer to last iteration
+   *  @param[in,out] plower pointer to the loop lower bound; it will contain
+   *  the lower bound of the first chunk
+   *  @param[in,out] pupper pointer to the loop upper bound; it will contain
+   *  the upper bound of the first chunk
+   *  @param[in,out] pstride pointer to the loop stride; it will contain the
+   *  stride between two successive chunks executed by the same thread
+   *  @param[in] incr loop increment bump
+   *  @param[in] chunk chunk size
+   */
+
+  // helper function for static chunk
+  static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride, ST chunk,
+                             T entityId, T numberOfEntities) {
+    // each thread executes multiple chunks all of the same size, except
+    // the last one
+    // distance between two successive chunks
+    stride = numberOfEntities * chunk;
+    lb = lb + entityId * chunk;
+    T inputUb = ub;
+    ub = lb + chunk - 1; // Clang uses i <= ub
+    // Say ub' is the beginning of the last chunk. Then whoever has a
+    // lower bound plus a multiple of the increment equal to ub' is
+    // the last one.
+    T beginningLastChunk = inputUb - (inputUb % chunk);
+    last = ((beginningLastChunk - lb) % stride) == 0;
+  }
+
+  //////////////////////////////////////////////////////////////////////////////
+  // Loop with static scheduling without chunk
+
+  // helper function for static no chunk
+  static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride, ST &chunk,
+                               T entityId, T numberOfEntities) {
+    // No chunk size specified. Each thread or warp gets at most one
+    // chunk; chunks are all almost of equal size
+    T loopSize = ub - lb + 1;
+
+    chunk = loopSize / numberOfEntities;
+    T leftOver = loopSize - chunk * numberOfEntities;
+
+    if (entityId < leftOver) {
+      chunk++;
+      lb = lb + entityId * chunk;
+    } else {
+      lb = lb + entityId * chunk + leftOver;
+    }
+
+    T inputUb = ub;
+    ub = lb + chunk - 1; // Clang uses i <= ub
+    last = lb <= inputUb && inputUb <= ub;
+    stride = loopSize; // make sure we only do 1 chunk per warp
+  }
+
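+  // Worked example: 100 iterations (lb=0, ub=99) over 4 threads. With
+  // chunk=10, ForStaticChunk gives thread 2 lb=20, ub=29, stride=40, i.e. it
+  // executes [20,29] and [60,69]; thread 1 owns the last chunk [90,99]. With
+  // no chunk, ForStaticNoChunk hands out one balanced chunk of 25 iterations
+  // per thread, so thread 2 gets [50,74].
+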
+  //////////////////////////////////////////////////////////////////////////////
+  // Support for Static Init
+
+  static void for_static_init(int32_t gtid, int32_t schedtype,
+                              int32_t *plastiter, T *plower, T *pupper,
+                              ST *pstride, ST chunk, bool IsSPMDExecutionMode) {
+    // When IsRuntimeUninitialized is true, we assume that the caller is
+    // in an L0 parallel region and that all worker threads participate.
+
+    // Assume we are in teams region or that we use a single block
+    // per target region
+    int numberOfActiveOMPThreads = omp_get_num_threads();
+
+    // All warps that are in excess of the maximum requested, do
+    // not execute the loop
+    ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads,
+            "current thread is not needed here; error");
+
+    // copy
+    int lastiter = 0;
+    T lb = *plower;
+    T ub = *pupper;
+    ST stride = *pstride;
+
+    // init
+    switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) {
+    case kmp_sched_static_chunk: {
+      if (chunk > 0) {
+        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
+                       numberOfActiveOMPThreads);
+        break;
+      }
+    } // note: if chunk <=0, use nochunk
+    case kmp_sched_static_balanced_chunk: {
+      if (chunk > 0) {
+        // round up to make sure the chunk is enough to cover all iterations
+        T tripCount = ub - lb + 1; // +1 because ub is inclusive
+        T span = (tripCount + numberOfActiveOMPThreads - 1) /
+                 numberOfActiveOMPThreads;
+        // perform chunk adjustment
+        chunk = (span + chunk - 1) & ~(chunk - 1);
+
+        ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
+        T oldUb = ub;
+        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
+                       numberOfActiveOMPThreads);
+        if (ub > oldUb)
+          ub = oldUb;
+        break;
+      }
+    } // note: if chunk <=0, use nochunk
+    case kmp_sched_static_nochunk: {
+      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid,
+                       numberOfActiveOMPThreads);
+      break;
+    }
+    case kmp_sched_distr_static_chunk: {
+      if (chunk > 0) {
+        ForStaticChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(),
+                       omp_get_num_teams());
+        break;
+      } // note: if chunk <=0, use nochunk
+    }
+    case kmp_sched_distr_static_nochunk: {
+      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(),
+                       omp_get_num_teams());
+      break;
+    }
+    case kmp_sched_distr_static_chunk_sched_static_chunkone: {
+      ForStaticChunk(lastiter, lb, ub, stride, chunk,
+                     numberOfActiveOMPThreads * omp_get_team_num() + gtid,
+                     omp_get_num_teams() * numberOfActiveOMPThreads);
+      break;
+    }
+    default: {
+      // ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype);
+      ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
+                     numberOfActiveOMPThreads);
+      break;
+    }
+    }
+    // copy back
+    *plastiter = lastiter;
+    *plower = lb;
+    *pupper = ub;
+    *pstride = stride;
+  }
+
+  //////////////////////////////////////////////////////////////////////////////
+  // Support for dispatch Init
+
+  static int OrderedSchedule(kmp_sched_t schedule) {
+    return schedule >= kmp_sched_ordered_first &&
+           schedule <= kmp_sched_ordered_last;
+  }
+
+  static void dispatch_init(IdentTy *loc, int32_t threadId,
+                            kmp_sched_t schedule, T lb, T ub, ST st, ST chunk,
+                            DynamicScheduleTracker *DST) {
+    int tid = mapping::getThreadIdInBlock();
+    T tnum = omp_get_num_threads();
+    T tripCount = ub - lb + 1; // +1 because ub is inclusive
+    ASSERT0(LT_FUSSY, threadId < tnum,
+            "current thread is not needed here; error");
+
+    /* Currently just ignore the monotonic and non-monotonic modifiers
+     * (the compiler isn't producing them yet anyway).
+     * When it is we'll want to look at them somewhere here and use that
+     * information to add to our schedule choice. We shouldn't need to pass
+     * them on, they merely affect which schedule we can legally choose for
+     * various dynamic cases. (In particular, whether or not a stealing scheme
+     * is legal).
+     */
+    schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
+
+    // Process schedule.
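+    // In short: ordered schedules and trivially small loops are demoted to a
+    // single static chunk covering the whole iteration space,
+    // schedule(runtime) is resolved through omp_get_schedule(), and
+    // schedule(auto) becomes static with chunk 1.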
+    if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) {
+      if (OrderedSchedule(schedule))
+        __kmpc_barrier(loc, threadId);
+      schedule = kmp_sched_static_chunk;
+      chunk = tripCount; // one thread gets the whole loop
+    } else if (schedule == kmp_sched_runtime) {
+      // process runtime
+      omp_sched_t rtSched;
+      int ChunkInt;
+      omp_get_schedule(&rtSched, &ChunkInt);
+      chunk = ChunkInt;
+      switch (rtSched) {
+      case omp_sched_static: {
+        if (chunk > 0)
+          schedule = kmp_sched_static_chunk;
+        else
+          schedule = kmp_sched_static_nochunk;
+        break;
+      }
+      case omp_sched_auto: {
+        schedule = kmp_sched_static_chunk;
+        chunk = 1;
+        break;
+      }
+      case omp_sched_dynamic:
+      case omp_sched_guided: {
+        schedule = kmp_sched_dynamic;
+        break;
+      }
+      }
+    } else if (schedule == kmp_sched_auto) {
+      schedule = kmp_sched_static_chunk;
+      chunk = 1;
+    } else {
+      // ASSERT(LT_FUSSY,
+      //        schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
+      //        "unknown schedule %d & chunk %lld\n", (int)schedule,
+      //        (long long)chunk);
+    }
+
+    // init schedules
+    if (schedule == kmp_sched_static_chunk) {
+      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
+      // save sched state
+      DST->ScheduleType = schedule;
+      // save ub
+      DST->LoopUpperBound = ub;
+      // compute static chunk
+      ST stride;
+      int lastiter = 0;
+      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
+      // save computed params
+      DST->Chunk = chunk;
+      DST->NextLowerBound = lb;
+      DST->Stride = stride;
+    } else if (schedule == kmp_sched_static_balanced_chunk) {
+      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
+      // save sched state
+      DST->ScheduleType = schedule;
+      // save ub
+      DST->LoopUpperBound = ub;
+      // compute static chunk
+      ST stride;
+      int lastiter = 0;
+      // round up to make sure the chunk is enough to cover all iterations
+      T span = (tripCount + tnum - 1) / tnum;
+      // perform chunk adjustment
+      chunk = (span + chunk - 1) & ~(chunk - 1);
+
+      T oldUb = ub;
+      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
+      ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
+      if (ub > oldUb)
+        ub = oldUb;
+      // save computed params
+      DST->Chunk = chunk;
+      DST->NextLowerBound = lb;
+      DST->Stride = stride;
+    } else if (schedule == kmp_sched_static_nochunk) {
+      ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
+      // save sched state
+      DST->ScheduleType = schedule;
+      // save ub
+      DST->LoopUpperBound = ub;
+      // compute static chunk
+      ST stride;
+      int lastiter = 0;
+      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
+      // save computed params
+      DST->Chunk = chunk;
+      DST->NextLowerBound = lb;
+      DST->Stride = stride;
+    } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
+      // save data
+      DST->ScheduleType = schedule;
+      if (chunk < 1)
+        chunk = 1;
+      DST->Chunk = chunk;
+      DST->LoopUpperBound = ub;
+      DST->NextLowerBound = lb;
+      __kmpc_barrier(loc, threadId);
+      if (tid == 0) {
+        Cnt = 0;
+        fence::team(__ATOMIC_SEQ_CST);
+      }
+      __kmpc_barrier(loc, threadId);
+    }
+  }
+
+  //////////////////////////////////////////////////////////////////////////////
+  // Support for dispatch next
+
+  static uint64_t NextIter() {
+    __kmpc_impl_lanemask_t active = mapping::activemask();
+    uint32_t leader = utils::ffs(active) - 1;
+    uint32_t change = utils::popc(active);
+    __kmpc_impl_lanemask_t lane_mask_lt = mapping::lanemaskLT();
+    unsigned int rank = utils::popc(active & lane_mask_lt);
+    uint64_t warp_res;
+    if (rank == 0) {
+      warp_res = atomic::add(&Cnt, change, __ATOMIC_SEQ_CST);
+    }
+    warp_res = utils::shuffle(active, warp_res, leader);
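+    // Only the warp leader (rank 0) executed the atomic add; its old value is
+    // broadcast to every active lane via the shuffle, and adding each lane's
+    // rank hands out consecutive iteration numbers at the cost of a single
+    // atomic per warp.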
+    return warp_res + rank;
+  }
+
+  static int DynamicNextChunk(T &lb, T &ub, T chunkSize, T loopLowerBound,
+                              T loopUpperBound) {
+    T N = NextIter();
+    lb = loopLowerBound + N * chunkSize;
+    ub = lb + chunkSize - 1; // Clang uses i <= ub
+
+    // 3 result cases:
+    //  a. lb and ub < loopUpperBound --> NOT_FINISHED
+    //  b. lb < loopUpperBound and ub >= loopUpperBound: last chunk -->
+    //     LAST_CHUNK (still treated as not finished)
+    //  c. lb and ub >= loopUpperBound: empty chunk --> FINISHED
+    // a.
+    if (lb <= loopUpperBound && ub < loopUpperBound) {
+      return NOT_FINISHED;
+    }
+    // b.
+    if (lb <= loopUpperBound) {
+      ub = loopUpperBound;
+      return LAST_CHUNK;
+    }
+    // c. if we are here, we are in case 'c'
+    lb = loopUpperBound + 2;
+    ub = loopUpperBound + 1;
+    return FINISHED;
+  }
+
+  static int dispatch_next(IdentTy *loc, int32_t gtid, int32_t *plast,
+                           T *plower, T *pupper, ST *pstride,
+                           DynamicScheduleTracker *DST) {
+    // ID of a thread in its own warp
+
+    // automatically selects thread or warp ID based on selected implementation
+    ASSERT0(LT_FUSSY, gtid < omp_get_num_threads(),
+            "current thread is not needed here; error");
+    // retrieve schedule
+    kmp_sched_t schedule = DST->ScheduleType;
+
+    // xxx reduce to one
+    if (schedule == kmp_sched_static_chunk ||
+        schedule == kmp_sched_static_nochunk) {
+      T myLb = DST->NextLowerBound;
+      T ub = DST->LoopUpperBound;
+      // finished?
+      if (myLb > ub) {
+        return DISPATCH_FINISHED;
+      }
+      // not finished, save current bounds
+      ST chunk = DST->Chunk;
+      *plower = myLb;
+      T myUb = myLb + chunk - 1; // Clang uses i <= ub
+      if (myUb > ub)
+        myUb = ub;
+      *pupper = myUb;
+      *plast = (int32_t)(myUb == ub);
+
+      // increment next lower bound by the stride
+      ST stride = DST->Stride;
+      DST->NextLowerBound = myLb + stride;
+      return DISPATCH_NOTFINISHED;
+    }
+    ASSERT0(LT_FUSSY,
+            schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
+            "bad sched");
+    T myLb, myUb;
+    int finished = DynamicNextChunk(myLb, myUb, DST->Chunk, DST->NextLowerBound,
+                                    DST->LoopUpperBound);
+
+    if (finished == FINISHED)
+      return DISPATCH_FINISHED;
+
+    // not finished (either not finished or last chunk)
+    *plast = (int32_t)(finished == LAST_CHUNK);
+    *plower = myLb;
+    *pupper = myUb;
+    *pstride = 1;
+
+    return DISPATCH_NOTFINISHED;
+  }
+
+  static void dispatch_fini() {
+    // nothing
+  }
+
+  //////////////////////////////////////////////////////////////////////////////
+  // end of template class that encapsulates all the helper functions
+  //////////////////////////////////////////////////////////////////////////////
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// KMP interface implementation (dyn loops)
+////////////////////////////////////////////////////////////////////////////////
+
+// TODO: This is a stopgap. We probably want to expand the dispatch API to take
+// a DST pointer which can then be allocated properly without malloc.
+DynamicScheduleTracker *THREAD_LOCAL(ThreadDSTPtr);
+
+// Create a new DST, link the current one, and define the new as current.
+static DynamicScheduleTracker *pushDST() {
+  DynamicScheduleTracker *NewDST = static_cast<DynamicScheduleTracker *>(
+      memory::allocGlobal(sizeof(DynamicScheduleTracker), "new DST"));
+  *NewDST = DynamicScheduleTracker({0});
+  NewDST->NextDST = ThreadDSTPtr;
+  ThreadDSTPtr = NewDST;
+  return ThreadDSTPtr;
+}
+
+// Return the current DST.
+static DynamicScheduleTracker *peekDST() { return ThreadDSTPtr; }
+
+// Pop the current DST and restore the last one.
+static void popDST() {
+  DynamicScheduleTracker *OldDST = ThreadDSTPtr->NextDST;
+  memory::freeGlobal(ThreadDSTPtr, "remove DST");
+  ThreadDSTPtr = OldDST;
+}
+
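+// The __kmpc_dispatch_* entry points below instantiate the template above for
+// each induction-variable type (signed/unsigned, 32/64 bit) and manage the
+// per-thread DST stack: *_init pushes a tracker, *_next peeks at it, and
+// *_fini pops it.
+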
+extern "C" {
+
+// init
+void __kmpc_dispatch_init_4(IdentTy *loc, int32_t tid, int32_t schedule,
+                            int32_t lb, int32_t ub, int32_t st, int32_t chunk) {
+  DynamicScheduleTracker *DST = pushDST();
+  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
+}
+
+void __kmpc_dispatch_init_4u(IdentTy *loc, int32_t tid, int32_t schedule,
+                             uint32_t lb, uint32_t ub, int32_t st,
+                             int32_t chunk) {
+  DynamicScheduleTracker *DST = pushDST();
+  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
+}
+
+void __kmpc_dispatch_init_8(IdentTy *loc, int32_t tid, int32_t schedule,
+                            int64_t lb, int64_t ub, int64_t st, int64_t chunk) {
+  DynamicScheduleTracker *DST = pushDST();
+  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
+}
+
+void __kmpc_dispatch_init_8u(IdentTy *loc, int32_t tid, int32_t schedule,
+                             uint64_t lb, uint64_t ub, int64_t st,
+                             int64_t chunk) {
+  DynamicScheduleTracker *DST = pushDST();
+  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
+}
+
+// next
+int __kmpc_dispatch_next_4(IdentTy *loc, int32_t tid, int32_t *p_last,
+                           int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
+  DynamicScheduleTracker *DST = peekDST();
+  return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
+      loc, tid, p_last, p_lb, p_ub, p_st, DST);
+}
+
+int __kmpc_dispatch_next_4u(IdentTy *loc, int32_t tid, int32_t *p_last,
+                            uint32_t *p_lb, uint32_t *p_ub, int32_t *p_st) {
+  DynamicScheduleTracker *DST = peekDST();
+  return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
+      loc, tid, p_last, p_lb, p_ub, p_st, DST);
+}
+
+int __kmpc_dispatch_next_8(IdentTy *loc, int32_t tid, int32_t *p_last,
+                           int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
+  DynamicScheduleTracker *DST = peekDST();
+  return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
+      loc, tid, p_last, p_lb, p_ub, p_st, DST);
+}
+
+int __kmpc_dispatch_next_8u(IdentTy *loc, int32_t tid, int32_t *p_last,
+                            uint64_t *p_lb, uint64_t *p_ub, int64_t *p_st) {
+  DynamicScheduleTracker *DST = peekDST();
+  return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
+      loc, tid, p_last, p_lb, p_ub, p_st, DST);
+}
+
+// fini
+void __kmpc_dispatch_fini_4(IdentTy *loc, int32_t tid) {
+  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini();
+  popDST();
+}
+
+void __kmpc_dispatch_fini_4u(IdentTy *loc, int32_t tid) {
+  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini();
+  popDST();
+}
+
+void __kmpc_dispatch_fini_8(IdentTy *loc, int32_t tid) {
+  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini();
+  popDST();
+}
+
+void __kmpc_dispatch_fini_8u(IdentTy *loc, int32_t tid) {
+  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini();
+  popDST();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// KMP interface implementation (static loops)
+////////////////////////////////////////////////////////////////////////////////
+
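+// Each entry point below forwards to for_static_init with the matching
+// template instantiation; the incr argument is accepted for interface
+// compatibility but is not used by this implementation.
+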
+void __kmpc_for_static_init_4(IdentTy *loc, int32_t global_tid,
+                              int32_t schedtype, int32_t *plastiter,
+                              int32_t *plower, int32_t *pupper,
+                              int32_t *pstride, int32_t incr, int32_t chunk) {
+  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
+      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+      mapping::isSPMDMode());
+}
+
+void __kmpc_for_static_init_4u(IdentTy *loc, int32_t global_tid,
+                               int32_t schedtype, int32_t *plastiter,
+                               uint32_t *plower, uint32_t *pupper,
+                               int32_t *pstride, int32_t incr, int32_t chunk) {
+  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
+      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+      mapping::isSPMDMode());
+}
+
+void __kmpc_for_static_init_8(IdentTy *loc, int32_t global_tid,
+                              int32_t schedtype, int32_t *plastiter,
+                              int64_t *plower, int64_t *pupper,
+                              int64_t *pstride, int64_t incr, int64_t chunk) {
+  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
+      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+      mapping::isSPMDMode());
+}
+
+void __kmpc_for_static_init_8u(IdentTy *loc, int32_t global_tid,
+                               int32_t schedtype, int32_t *plastiter,
+                               uint64_t *plower, uint64_t *pupper,
+                               int64_t *pstride, int64_t incr, int64_t chunk) {
+  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
+      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+      mapping::isSPMDMode());
+}
+
+void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {}
+}
+
+#pragma omp end declare target