diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/cancel.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/cancel.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/cancel.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/cancel.cu @@ -16,12 +16,12 @@ int32_t cancelVal) { PRINT(LD_IO, "call kmpc_cancellationpoint(cancel val %d)\n", (int)cancelVal); // disabled - return FALSE; + return 0; } EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid, int32_t cancelVal) { PRINT(LD_IO, "call kmpc_cancel(cancel val %d)\n", (int)cancelVal); // disabled - return FALSE; + return 0; } diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/debug.h b/openmp/libomptarget/deviceRTLs/nvptx/src/debug.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/debug.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/debug.h @@ -126,12 +126,12 @@ #if OMPTARGET_NVPTX_DEBUG || OMPTARGET_NVPTX_TEST || OMPTARGET_NVPTX_WARNING #include -#include "option.h" +#include "target_impl.h" template NOINLINE static void log(const char *fmt, Arguments... parameters) { - printf(fmt, (int)blockIdx.x, (int)threadIdx.x, (int)(threadIdx.x / WARPSIZE), - (int)(threadIdx.x & 0x1F), parameters...); + printf(fmt, (int)blockIdx.x, (int)threadIdx.x, (int)(__kmpcImplGetWarpId()), + (int)(__kmpcImplGetThreadIdInWarp()), parameters...); } #endif @@ -142,9 +142,8 @@ NOINLINE static void check(bool cond, const char *fmt, Arguments... parameters) { if (!cond) - printf(fmt, (int)blockIdx.x, (int)threadIdx.x, - (int)(threadIdx.x / WARPSIZE), (int)(threadIdx.x & 0x1F), - parameters...); + printf(fmt, (int)blockIdx.x, (int)threadIdx.x, (int)(__kmpcImplGetWarpId()), + (int)(__kmpcImplGetThreadIdInWarp()), parameters...); assert(cond); } @@ -195,7 +194,7 @@ } #else -#define DON(_flag) (FALSE) +#define DON(_flag) (0) #define PRINT0(flag, str) #define PRINT(flag, str, _args...) @@ -247,7 +246,7 @@ #else -#define TON(_flag) (FALSE) +#define TON(_flag) (0) #define ASSERT0(_flag, _cond, _str) #define ASSERT(_flag, _cond, _str, _args...) @@ -279,7 +278,7 @@ #else -#define WON(_flag) (FALSE) +#define WON(_flag) (0) #define WARNING0(_flag, _str) #define WARNING(_flag, _str, _args...) diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu @@ -286,7 +286,7 @@ } EXTERN int omp_get_cancellation(void) { - int rc = FALSE; // currently false only + int rc = 0; PRINT(LD_IO, "call omp_get_cancellation() returns %d\n", rc); return rc; } diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu @@ -170,7 +170,7 @@ break; } default: { - ASSERT(LT_FUSSY, FALSE, "unknown schedtype %d", (int)schedtype); + ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype); PRINT(LD_LOOP, "unknown schedtype %d, revert back to static chunk\n", (int)schedtype); ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -25,9 +25,9 @@ #include // local includes +#include "target_impl.h" #include "debug.h" // debug #include "interface.h" // interfaces with omp, compiler, and user -#include "option.h" // choices we have #include "state-queue.h" #include "support.h" diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/option.h b/openmp/libomptarget/deviceRTLs/nvptx/src/option.h deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/option.h +++ /dev/null @@ -1,68 +0,0 @@ -//===------------ option.h - NVPTX OpenMP GPU options ------------ CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// GPU default options -// -//===----------------------------------------------------------------------===// -#ifndef _OPTION_H_ -#define _OPTION_H_ - -#include "interface.h" - -//////////////////////////////////////////////////////////////////////////////// -// Kernel options -//////////////////////////////////////////////////////////////////////////////// - -//////////////////////////////////////////////////////////////////////////////// -// The following def must match the absolute limit hardwired in the host RTL -// max number of threads per team -#define MAX_THREADS_PER_TEAM 1024 - -#define WARPSIZE 32 - -// The named barrier for active parallel threads of a team in an L1 parallel -// region to synchronize with each other. -#define L1_BARRIER (1) - -// Maximum number of preallocated arguments to an outlined parallel/simd function. -// Anything more requires dynamic memory allocation. -#define MAX_SHARED_ARGS 20 - -// Maximum number of omp state objects per SM allocated statically in global -// memory. -#if __CUDA_ARCH__ >= 700 -#define OMP_STATE_COUNT 32 -#define MAX_SM 84 -#elif __CUDA_ARCH__ >= 600 -#define OMP_STATE_COUNT 32 -#define MAX_SM 56 -#else -#define OMP_STATE_COUNT 16 -#define MAX_SM 16 -#endif - -#define OMP_ACTIVE_PARALLEL_LEVEL 128 - -//////////////////////////////////////////////////////////////////////////////// -// algo options -//////////////////////////////////////////////////////////////////////////////// - -//////////////////////////////////////////////////////////////////////////////// -// misc options (by def everythig here is device) -//////////////////////////////////////////////////////////////////////////////// - -#define INLINE __forceinline__ __device__ -#define NOINLINE __noinline__ __device__ -#ifndef TRUE -#define TRUE 1 -#endif -#ifndef FALSE -#define FALSE 0 -#endif - -#endif diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -460,7 +460,7 @@ EXTERN void __kmpc_push_num_teams(kmp_Ident *loc, int32_t tid, int32_t num_teams, int32_t thread_limit) { PRINT(LD_IO, "call kmpc_push_num_teams %d\n", (int)num_teams); - ASSERT0(LT_FUSSY, FALSE, + ASSERT0(LT_FUSSY, 0, "should never have anything with new teams on device"); } diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/state-queue.h b/openmp/libomptarget/deviceRTLs/nvptx/src/state-queue.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/state-queue.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/state-queue.h @@ -21,7 +21,7 @@ #include -#include "option.h" // choices we have +#include "target_impl.h" template class omptarget_nvptx_Queue { private: diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -14,7 +14,42 @@ #include -#include "option.h" +//////////////////////////////////////////////////////////////////////////////// +// Kernel options +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// The following def must match the absolute limit hardwired in the host RTL +// max number of threads per team +#define MAX_THREADS_PER_TEAM 1024 + +#define WARPSIZE 32 + +// The named barrier for active parallel threads of a team in an L1 parallel +// region to synchronize with each other. +#define L1_BARRIER (1) + +// Maximum number of preallocated arguments to an outlined parallel/simd function. +// Anything more requires dynamic memory allocation. +#define MAX_SHARED_ARGS 20 + +// Maximum number of omp state objects per SM allocated statically in global +// memory. +#if __CUDA_ARCH__ >= 700 +#define OMP_STATE_COUNT 32 +#define MAX_SM 84 +#elif __CUDA_ARCH__ >= 600 +#define OMP_STATE_COUNT 32 +#define MAX_SM 56 +#else +#define OMP_STATE_COUNT 16 +#define MAX_SM 16 +#endif + +#define OMP_ACTIVE_PARALLEL_LEVEL 128 + +#define INLINE __forceinline__ __device__ +#define NOINLINE __noinline__ __device__ INLINE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) { asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val)); @@ -97,4 +132,13 @@ #endif // CUDA_VERSION } +INLINE uint32_t __kmpcImplGetWarpId() { + return threadIdx.x / WARPSIZE; +} + +INLINE uint32_t __kmpcImplGetThreadIdInWarp() { + int threadIdxMask = WARPSIZE - 1; + return threadIdx.x & threadIdxMask; +} + #endif