diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt --- a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt +++ b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt @@ -90,22 +90,26 @@ endif() # create libraries -set(mcpus gfx700 gfx701 gfx801 gfx803 gfx900) +set(mcpus gfx700 gfx701 gfx801 gfx803 gfx900 gfx906) if (DEFINED LIBOMPTARGET_AMDGCN_GFXLIST) set(mcpus ${LIBOMPTARGET_AMDGCN_GFXLIST}) endif() macro(add_cuda_bc_library) set(cu_cmd ${AOMP_BINDIR}/clang++ + -xc++ + -c -std=c++14 - -fcuda-rdc + -target amdgcn + -emit-llvm + -Xclang -aux-triple -Xclang x86_64-unknown-linux-gnu # see nvptx + -fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device + -D__AMDGCN__ + -Xclang -target-cpu -Xclang ${mcpu} -fvisibility=default - --cuda-device-only -Wno-unused-value - -x hip - -nogpulib -nogpuinc + -nogpulib -O${optimization_level} - --cuda-gpu-arch=${mcpu} ${CUDA_DEBUG} -I${CMAKE_CURRENT_SOURCE_DIR}/src -I${devicertl_base_directory}) diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_interface.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_interface.h --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_interface.h +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_interface.h @@ -11,7 +11,7 @@ #include -#define EXTERN extern "C" __attribute__((device)) +#define EXTERN extern "C" typedef uint64_t __kmpc_impl_lanemask_t; typedef uint32_t omp_lock_t; /* arbitrary type of the right length */ diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_intrinsics.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_intrinsics.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_intrinsics.h @@ -0,0 +1,46 @@ +//===--- amdgcn_intrinsics.h - Intrinsics used by deviceRTL ---------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _AMDGCN_INTRINSICS_H_ +#define _AMDGCN_INTRINSICS_H_ + +#include + +#include "target_impl.h" + +#ifdef _OPENMP +// Openmp doesn't pull these builtins into scope, but does error if the type is +// incorrect +// This may be a quirk of openmp's compile for host + device assumption, where +// these don't resolve to anything on the host + +EXTERN uint32_t __builtin_amdgcn_atomic_inc32(volatile uint32_t *, uint32_t, + uint32_t, const char *); +EXTERN void __builtin_amdgcn_s_barrier(void); +EXTERN void __builtin_amdgcn_fence(uint32_t, const char *); + +EXTERN void __builtin_amdgcn_s_sleep(int); + +EXTERN uint32_t __builtin_amdgcn_workitem_id_x(void); +EXTERN uint32_t __builtin_amdgcn_workgroup_id_x(void); +EXTERN uint16_t __builtin_amdgcn_workgroup_size_x(void); +EXTERN uint32_t __builtin_amdgcn_grid_size_x(void); + +EXTERN uint64_t __builtin_amdgcn_s_memrealtime(void); +EXTERN uint32_t __builtin_amdgcn_s_getreg(int32_t); +EXTERN uint64_t __builtin_amdgcn_read_exec(void); + +EXTERN __attribute__((address_space(4))) void * +__builtin_amdgcn_dispatch_ptr() noexcept; + +EXTERN uint32_t __builtin_amdgcn_mbcnt_lo(uint32_t, uint32_t); +EXTERN uint32_t __builtin_amdgcn_mbcnt_hi(uint32_t, uint32_t); +EXTERN int32_t __builtin_amdgcn_ds_bpermute(int32_t, int32_t); +#endif + +#endif diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip @@ -26,6 +26,6 @@ DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *) { warn(); } DEVICE void __kmpc_impl_set_lock(omp_lock_t *) { warn(); } DEVICE void __kmpc_impl_unset_lock(omp_lock_t *) { warn(); } -DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock) { warn(); } +DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock) { warn(); return 0;} #pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip @@ -8,6 +8,7 @@ #pragma omp declare target #include "target_impl.h" +#include "amdgcn_intrinsics.h" // Partially derived fom hcc_detail/device_functions.h @@ -53,7 +54,7 @@ // bound on how many compute units are available. Some values in this // range may never be returned if there are fewer than 2^CU_ID_SIZE CUs. -DEVICE uint32_t __kmpc_impl_smid() { +EXTERN uint32_t __kmpc_impl_smid() { uint32_t cu_id = __builtin_amdgcn_s_getreg( ENCODE_HWREG(HW_ID_CU_ID_SIZE, HW_ID_CU_ID_OFFSET, HW_ID)); uint32_t se_id = __builtin_amdgcn_s_getreg( diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h @@ -22,11 +22,9 @@ #include #include -#define DEVICE __attribute__((device)) +#define DEVICE #define INLINE inline DEVICE #define NOINLINE __attribute__((noinline)) DEVICE -#define SHARED(NAME) __attribute__((shared)) NAME -#define EXTERN_SHARED(NAME) __attribute__((shared)) NAME #define ALIGN(N) __attribute__((aligned(N))) //////////////////////////////////////////////////////////////////////////////// diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip @@ -11,7 +11,10 @@ //===----------------------------------------------------------------------===// #pragma omp declare target +#include "amdgcn_intrinsics.h" +#include "common/omptarget.h" #include "target_impl.h" +#include "target_interface.h" // Implementations initially derived from hcc @@ -66,11 +69,12 @@ return __builtin_amdgcn_ds_bpermute(index << 2, var); } -static DEVICE SHARED uint32_t L1_Barrier; +uint32_t __kmpc_L1_Barrier [[clang::loader_uninitialized]]; +#pragma allocate(__kmpc_L1_Barrier) allocator(omp_pteam_mem_alloc) EXTERN void __kmpc_impl_target_init() { // Don't have global ctors, and shared memory is not zero init - __atomic_store_n(&L1_Barrier, 0u, __ATOMIC_RELEASE); + __atomic_store_n(&__kmpc_L1_Barrier, 0u, __ATOMIC_RELEASE); } EXTERN void __kmpc_impl_named_sync(uint32_t num_threads) { @@ -94,8 +98,8 @@ bool isLowest = GetLaneId() == lowestActiveThread; if (isLowest) { - uint32_t load = - __atomic_fetch_add(&L1_Barrier, 1, __ATOMIC_RELAXED); // commutative + uint32_t load = __atomic_fetch_add(&__kmpc_L1_Barrier, 1, + __ATOMIC_RELAXED); // commutative // Record the number of times the barrier has been passed uint32_t generation = load & 0xffff0000u; @@ -107,12 +111,12 @@ load &= 0xffff0000u; // because bits zeroed second // Reset the wave counter and release the waiting waves - __atomic_store_n(&L1_Barrier, load, __ATOMIC_RELAXED); + __atomic_store_n(&__kmpc_L1_Barrier, load, __ATOMIC_RELAXED); } else { // more waves still to go, spin until generation counter changes do { __builtin_amdgcn_s_sleep(0); - load = __atomic_load_n(&L1_Barrier, __ATOMIC_RELAXED); + load = __atomic_load_n(&__kmpc_L1_Barrier, __ATOMIC_RELAXED); } while ((load & 0xffff0000u) == generation); } } diff --git a/openmp/libomptarget/deviceRTLs/common/omptarget.h b/openmp/libomptarget/deviceRTLs/common/omptarget.h --- a/openmp/libomptarget/deviceRTLs/common/omptarget.h +++ b/openmp/libomptarget/deviceRTLs/common/omptarget.h @@ -337,8 +337,9 @@ //////////////////////////////////////////////////////////////////////////////// INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __builtin_ffs(x); } - INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __builtin_popcount(x); } +INLINE uint32_t __kmpc_impl_ffs(uint64_t x) { return __builtin_ffsl(x); } +INLINE uint32_t __kmpc_impl_popc(uint64_t x) { return __builtin_popcountl(x); } #include "common/omptargeti.h"