diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt --- a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt +++ b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt @@ -55,6 +55,7 @@ DIRECTORY) set(cuda_sources + ${CMAKE_CURRENT_SOURCE_DIR}/src/target_impl.hip ${devicertl_base_directory}/common/src/cancel.cu ${devicertl_base_directory}/common/src/critical.cu ${devicertl_base_directory}/common/src/data_sharing.cu diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h @@ -1,4 +1,4 @@ -//===------------ target_impl.h - AMDGCN OpenMP GPU options ------ CUDA -*-===// +//===------- target_impl.h - AMDGCN OpenMP GPU implementation ----- HIP -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,11 +6,11 @@ // //===----------------------------------------------------------------------===// // -// Definitions of target specific functions +// Declarations and definitions of target specific functions and constants // //===----------------------------------------------------------------------===// -#ifndef _TARGET_IMPL_H_ -#define _TARGET_IMPL_H_ +#ifndef OMPTARGET_AMDGCN_TARGET_IMPL_H +#define OMPTARGET_AMDGCN_TARGET_IMPL_H #ifndef __AMDGCN__ #error "amdgcn target_impl.h expects to be compiled under __AMDGCN__" @@ -40,13 +40,12 @@ #define WARPSIZE 64 - // The named barrier for active parallel threads of a team in an L1 parallel // region to synchronize with each other. #define L1_BARRIER (1) -// Maximum number of preallocated arguments to an outlined parallel/simd function. -// Anything more requires dynamic memory allocation. +// Maximum number of preallocated arguments to an outlined parallel/simd +// function. Anything more requires dynamic memory allocation. #define MAX_SHARED_ARGS 20 // Maximum number of omp state objects per SM allocated statically in global @@ -54,7 +53,6 @@ #define OMP_STATE_COUNT 32 #define MAX_SM 64 - #define OMP_ACTIVE_PARALLEL_LEVEL 128 // Data sharing related quantities, need to match what is used in the compiler. @@ -69,18 +67,6 @@ DS_Max_Warp_Number = 16, }; -// warp vote function -EXTERN uint64_t __ballot64(int predicate); -// initialized with a 64-bit mask with bits set in positions less than the -// thread's lane number in the warp -EXTERN uint64_t __lanemask_lt(); -// initialized with a 64-bit mask with bits set in positions greater than the -// thread's lane number in the warp -EXTERN uint64_t __lanemask_gt(); - -// CU id -EXTERN unsigned __smid(); - INLINE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) { lo = (uint32_t)(val & UINT64_C(0x00000000FFFFFFFF)); hi = (uint32_t)((val & UINT64_C(0xFFFFFFFF00000000)) >> 32); @@ -93,24 +79,15 @@ static const __kmpc_impl_lanemask_t __kmpc_impl_all_lanes = UINT64_C(0xffffffffffffffff); -INLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() { - return __lanemask_lt(); -} +DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt(); -INLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() { - return __lanemask_gt(); -} +DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt(); -INLINE uint32_t __kmpc_impl_smid() { - return __smid(); -} +DEVICE uint32_t __kmpc_impl_smid(); -INLINE double __kmpc_impl_get_wtick() { return ((double)1E-9); } +DEVICE double __kmpc_impl_get_wtick(); -EXTERN uint64_t __clock64(); -INLINE double __kmpc_impl_get_wtime() { - return ((double)1.0 / 745000000.0) * __clock64(); -} +DEVICE double __kmpc_impl_get_wtime(); INLINE uint64_t __kmpc_impl_ffs(uint64_t x) { return __builtin_ffsl(x); } @@ -120,14 +97,12 @@ return x < y ? x : y; } -INLINE __kmpc_impl_lanemask_t __kmpc_impl_activemask() { - return __ballot64(1); -} +DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask(); -EXTERN int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t Var, +DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t Var, int32_t SrcLane); -EXTERN int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t Var, +DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t Var, uint32_t Delta, int32_t Width); INLINE void __kmpc_impl_syncthreads() { __builtin_amdgcn_s_barrier(); } @@ -143,40 +118,36 @@ __builtin_amdgcn_s_barrier(); } -EXTERN void __kmpc_impl_threadfence(void); -EXTERN void __kmpc_impl_threadfence_block(void); -EXTERN void __kmpc_impl_threadfence_system(void); +DEVICE void __kmpc_impl_threadfence(void); +DEVICE void __kmpc_impl_threadfence_block(void); +DEVICE void __kmpc_impl_threadfence_system(void); // Calls to the AMDGCN layer (assuming 1D layout) -EXTERN uint64_t __ockl_get_local_size(uint32_t); -EXTERN uint64_t __ockl_get_num_groups(uint32_t); INLINE int GetThreadIdInBlock() { return __builtin_amdgcn_workitem_id_x(); } INLINE int GetBlockIdInKernel() { return __builtin_amdgcn_workgroup_id_x(); } -INLINE int GetNumberOfBlocksInKernel() { return __ockl_get_num_groups(0); } -INLINE int GetNumberOfThreadsInBlock() { return __ockl_get_local_size(0); } +DEVICE int GetNumberOfBlocksInKernel(); +DEVICE int GetNumberOfThreadsInBlock(); -EXTERN bool __kmpc_impl_is_first_active_thread(); +DEVICE bool __kmpc_impl_is_first_active_thread(); // Locks -EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock); -EXTERN void __kmpc_impl_destroy_lock(omp_lock_t *lock); -EXTERN void __kmpc_impl_set_lock(omp_lock_t *lock); -EXTERN void __kmpc_impl_unset_lock(omp_lock_t *lock); -EXTERN int __kmpc_impl_test_lock(omp_lock_t *lock); +DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock); +DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *lock); +DEVICE void __kmpc_impl_set_lock(omp_lock_t *lock); +DEVICE void __kmpc_impl_unset_lock(omp_lock_t *lock); +DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock); // Memory -EXTERN void *__kmpc_impl_malloc(size_t x); -EXTERN void __kmpc_impl_free(void *x); +DEVICE void *__kmpc_impl_malloc(size_t x); +DEVICE void __kmpc_impl_free(void *x); // DEVICE versions of part of libc -extern "C" { -DEVICE __attribute__((noreturn)) void +EXTERN __attribute__((noreturn)) void __assertfail(const char *, const char *, unsigned, const char *, size_t); -INLINE static void __assert_fail(const char *__message, const char *__file, - unsigned int __line, const char *__function) { +INLINE void __assert_fail(const char *__message, const char *__file, + unsigned int __line, const char *__function) { __assertfail(__message, __file, __line, __function, sizeof(char)); } -DEVICE int printf(const char *, ...); -} +EXTERN int printf(const char *, ...); #endif diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip @@ -0,0 +1,25 @@ +//===------- target_impl.hip - AMDGCN OpenMP GPU implementation --- HIP -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Definitions of target specific functions +// +//===----------------------------------------------------------------------===// + +#include "target_impl.h" + +DEVICE double __kmpc_impl_get_wtick() { return ((double)1E-9); } + +EXTERN uint64_t __clock64(); +DEVICE double __kmpc_impl_get_wtime() { + return ((double)1.0 / 745000000.0) * __clock64(); +} + +EXTERN uint64_t __ockl_get_local_size(uint32_t); +EXTERN uint64_t __ockl_get_num_groups(uint32_t); +DEVICE int GetNumberOfBlocksInKernel() { return __ockl_get_num_groups(0); } +DEVICE int GetNumberOfThreadsInBlock() { return __ockl_get_local_size(0); }