diff --git a/openmp/libomptarget/DeviceRTL/include/Mapping.h b/openmp/libomptarget/DeviceRTL/include/Mapping.h --- a/openmp/libomptarget/DeviceRTL/include/Mapping.h +++ b/openmp/libomptarget/DeviceRTL/include/Mapping.h @@ -18,7 +18,7 @@ namespace mapping { -#pragma omp declare target +#pragma omp begin declare target device_type(nohost) inline constexpr uint32_t MaxThreadsPerTeam = 1024; diff --git a/openmp/libomptarget/DeviceRTL/include/State.h b/openmp/libomptarget/DeviceRTL/include/State.h --- a/openmp/libomptarget/DeviceRTL/include/State.h +++ b/openmp/libomptarget/DeviceRTL/include/State.h @@ -15,7 +15,7 @@ #include "Debug.h" #include "Types.h" -#pragma omp declare target +#pragma omp begin declare target device_type(nohost) namespace _OMP { diff --git a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp --- a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp @@ -18,7 +18,7 @@ using namespace _OMP; -#pragma omp declare target +#pragma omp begin declare target device_type(nohost) // defined by CGOpenMPRuntimeGPU extern uint32_t __omp_rtl_debug_kind; diff --git a/openmp/libomptarget/DeviceRTL/src/Debug.cpp b/openmp/libomptarget/DeviceRTL/src/Debug.cpp --- a/openmp/libomptarget/DeviceRTL/src/Debug.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Debug.cpp @@ -18,7 +18,7 @@ using namespace _OMP; -#pragma omp declare target +#pragma omp begin declare target device_type(nohost) extern "C" { void __assert_assume(bool condition) { __builtin_assume(condition); } @@ -30,6 +30,10 @@ __builtin_trap(); } +namespace impl { +int32_t omp_vprintf(const char *Format, void *Arguments, uint32_t); +} + #pragma omp begin declare variant match( \ device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)}) int32_t vprintf(const char *, void *); @@ -55,8 +59,7 @@ } /// Current indentation level for the function trace. Only accessed by thread 0. 
-__attribute__((loader_uninitialized)) -static uint32_t Level; +__attribute__((loader_uninitialized)) static uint32_t Level; #pragma omp allocate(Level) allocator(omp_pteam_mem_alloc) DebugEntryRAII::DebugEntryRAII(const char *File, const unsigned Line, diff --git a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp --- a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp @@ -19,7 +19,7 @@ using namespace _OMP; -#pragma omp declare target +#pragma omp begin declare target device_type(nohost) static void inititializeRuntime(bool IsSPMD) { // Order is important here. diff --git a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp --- a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp @@ -15,7 +15,7 @@ #include "Types.h" #include "Utils.h" -#pragma omp declare target +#pragma omp begin declare target device_type(nohost) #include "llvm/Frontend/OpenMP/OMPGridValues.h" @@ -24,6 +24,23 @@ namespace _OMP { namespace impl { +// Forward declarations to be defined for AMDGCN and NVPTX. 
+const llvm::omp::GV &getGridValue(); +uint32_t getGridDim(uint32_t n, uint16_t d); +uint32_t getWorkgroupDim(uint32_t group_id, uint32_t grid_size, + uint16_t group_size); +uint32_t getNumHardwareThreadsInBlock(); +LaneMaskTy activemask(); +LaneMaskTy lanemaskLT(); +LaneMaskTy lanemaskGT(); +uint32_t getThreadIdInWarp(); +uint32_t getThreadIdInBlock(); +uint32_t getKernelSize(); +uint32_t getBlockId(); +uint32_t getNumberOfBlocks(); +uint32_t getWarpId(); +uint32_t getNumberOfWarpsInBlock(); + /// AMDGCN Implementation /// ///{ diff --git a/openmp/libomptarget/DeviceRTL/src/Misc.cpp b/openmp/libomptarget/DeviceRTL/src/Misc.cpp --- a/openmp/libomptarget/DeviceRTL/src/Misc.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Misc.cpp @@ -13,11 +13,15 @@ #include "Debug.h" -#pragma omp declare target +#pragma omp begin declare target device_type(nohost) namespace _OMP { namespace impl { +double getWTick(); + +double getWTime(); + /// AMDGCN Implementation /// ///{ diff --git a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp --- a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp @@ -42,7 +42,7 @@ using namespace _OMP; -#pragma omp declare target +#pragma omp begin declare target device_type(nohost) namespace { diff --git a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp --- a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp @@ -22,7 +22,7 @@ namespace { -#pragma omp declare target +#pragma omp begin declare target device_type(nohost) void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) { for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) { diff --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp --- a/openmp/libomptarget/DeviceRTL/src/State.cpp +++ 
b/openmp/libomptarget/DeviceRTL/src/State.cpp @@ -19,7 +19,7 @@ using namespace _OMP; -#pragma omp declare target +#pragma omp begin declare target device_type(nohost) /// Memory implementation /// diff --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp --- a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp @@ -19,7 +19,7 @@ #include "Types.h" #include "Utils.h" -#pragma omp declare target +#pragma omp begin declare target device_type(nohost) using namespace _OMP; @@ -63,6 +63,22 @@ } ///} +// Forward declarations to be defined for AMDGCN and NVPTX. +uint32_t atomicInc(uint32_t *A, uint32_t V, int Ordering); +void namedBarrierInit(); +void namedBarrier(); +void fenceTeam(int Ordering); +void fenceKernel(int Ordering); +void fenceSystem(int Ordering); +void syncWarp(__kmpc_impl_lanemask_t); +void syncThreads(); +void syncThreadsAligned() { syncThreads(); } +void unsetLock(omp_lock_t *); +int testLock(omp_lock_t *); +void initLock(omp_lock_t *); +void destroyLock(omp_lock_t *); +void setLock(omp_lock_t *); + /// AMDGCN Implementation /// ///{ diff --git a/openmp/libomptarget/DeviceRTL/src/Tasking.cpp b/openmp/libomptarget/DeviceRTL/src/Tasking.cpp --- a/openmp/libomptarget/DeviceRTL/src/Tasking.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Tasking.cpp @@ -20,7 +20,7 @@ using namespace _OMP; -#pragma omp declare target +#pragma omp begin declare target device_type(nohost) TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, uint32_t, int32_t, uint64_t TaskSizeInclPrivateValues, diff --git a/openmp/libomptarget/DeviceRTL/src/Utils.cpp b/openmp/libomptarget/DeviceRTL/src/Utils.cpp --- a/openmp/libomptarget/DeviceRTL/src/Utils.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Utils.cpp @@ -15,7 +15,7 @@ #include "Interface.h" #include "Mapping.h" -#pragma omp declare target +#pragma omp begin declare target device_type(nohost) using namespace 
_OMP; @@ -32,6 +32,9 @@ namespace impl { +void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits); +uint64_t Pack(uint32_t LowBits, uint32_t HighBits); + /// AMDGCN Implementation /// ///{ @@ -72,6 +75,10 @@ #pragma omp end declare variant +int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane); +int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta, + int32_t Width); + /// AMDGCN Implementation /// ///{ diff --git a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp --- a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp @@ -43,7 +43,7 @@ #define NOT_FINISHED 1 #define LAST_CHUNK 2 -#pragma omp declare target +#pragma omp begin declare target device_type(nohost) // TODO: This variable is a hack inherited from the old runtime. static uint64_t SHARED(Cnt);