diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/debug.h b/openmp/libomptarget/deviceRTLs/nvptx/src/debug.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/debug.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/debug.h @@ -126,12 +126,13 @@ #if OMPTARGET_NVPTX_DEBUG || OMPTARGET_NVPTX_TEST || OMPTARGET_NVPTX_WARNING #include -#include "option.h" +#include "target_impl.h" template NOINLINE static void log(const char *fmt, Arguments... parameters) { + int threadIdxMask = WARPSIZE - 1; printf(fmt, (int)blockIdx.x, (int)threadIdx.x, (int)(threadIdx.x / WARPSIZE), - (int)(threadIdx.x & 0x1F), parameters...); + (int)(threadIdx.x & threadIdxMask), parameters...); } #endif @@ -141,9 +142,10 @@ template NOINLINE static void check(bool cond, const char *fmt, Arguments... parameters) { + int threadIdxMask = WARPSIZE - 1; if (!cond) printf(fmt, (int)blockIdx.x, (int)threadIdx.x, - (int)(threadIdx.x / WARPSIZE), (int)(threadIdx.x & 0x1F), + (int)(threadIdx.x / WARPSIZE), (int)(threadIdx.x & threadIdxMask), parameters...); assert(cond); } diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -25,9 +25,9 @@ #include // local includes +#include "target_impl.h" #include "debug.h" // debug #include "interface.h" // interfaces with omp, compiler, and user -#include "option.h" // choices we have #include "state-queue.h" #include "support.h" diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/option.h b/openmp/libomptarget/deviceRTLs/nvptx/src/option.h deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/option.h +++ /dev/null @@ -1,68 +0,0 @@ -//===------------ option.h - NVPTX OpenMP GPU options ------------ CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// GPU default options -// -//===----------------------------------------------------------------------===// -#ifndef _OPTION_H_ -#define _OPTION_H_ - -#include "interface.h" - -//////////////////////////////////////////////////////////////////////////////// -// Kernel options -//////////////////////////////////////////////////////////////////////////////// - -//////////////////////////////////////////////////////////////////////////////// -// The following def must match the absolute limit hardwired in the host RTL -// max number of threads per team -#define MAX_THREADS_PER_TEAM 1024 - -#define WARPSIZE 32 - -// The named barrier for active parallel threads of a team in an L1 parallel -// region to synchronize with each other. -#define L1_BARRIER (1) - -// Maximum number of preallocated arguments to an outlined parallel/simd function. -// Anything more requires dynamic memory allocation. -#define MAX_SHARED_ARGS 20 - -// Maximum number of omp state objects per SM allocated statically in global -// memory. -#if __CUDA_ARCH__ >= 700 -#define OMP_STATE_COUNT 32 -#define MAX_SM 84 -#elif __CUDA_ARCH__ >= 600 -#define OMP_STATE_COUNT 32 -#define MAX_SM 56 -#else -#define OMP_STATE_COUNT 16 -#define MAX_SM 16 -#endif - -#define OMP_ACTIVE_PARALLEL_LEVEL 128 - -//////////////////////////////////////////////////////////////////////////////// -// algo options -//////////////////////////////////////////////////////////////////////////////// - -//////////////////////////////////////////////////////////////////////////////// -// misc options (by def everythig here is device) -//////////////////////////////////////////////////////////////////////////////// - -#define INLINE __forceinline__ __device__ -#define NOINLINE __noinline__ __device__ -#ifndef TRUE -#define TRUE 1 -#endif -#ifndef FALSE -#define FALSE 0 -#endif - -#endif diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/state-queue.h b/openmp/libomptarget/deviceRTLs/nvptx/src/state-queue.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/state-queue.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/state-queue.h @@ -21,7 +21,7 @@ #include -#include "option.h" // choices we have +#include "target_impl.h" template class omptarget_nvptx_Queue { private: diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -14,7 +14,52 @@ #include -#include "option.h" +//////////////////////////////////////////////////////////////////////////////// +// Kernel options +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// The following def must match the absolute limit hardwired in the host RTL +// max number of threads per team +#define MAX_THREADS_PER_TEAM 1024 + +#define WARPSIZE 32 + +// The named barrier for active parallel threads of a team in an L1 parallel +// region to synchronize with each other. +#define L1_BARRIER (1) + +// Maximum number of preallocated arguments to an outlined parallel/simd function. +// Anything more requires dynamic memory allocation. +#define MAX_SHARED_ARGS 20 + +// Maximum number of omp state objects per SM allocated statically in global +// memory. +#if __CUDA_ARCH__ >= 700 +#define OMP_STATE_COUNT 32 +#define MAX_SM 84 +#elif __CUDA_ARCH__ >= 600 +#define OMP_STATE_COUNT 32 +#define MAX_SM 56 +#else +#define OMP_STATE_COUNT 16 +#define MAX_SM 16 +#endif + +#define OMP_ACTIVE_PARALLEL_LEVEL 128 + +//////////////////////////////////////////////////////////////////////////////// +// misc options (by def everything here is device) +//////////////////////////////////////////////////////////////////////////////// + +#define INLINE __forceinline__ __device__ +#define NOINLINE __noinline__ __device__ +#ifndef TRUE +#define TRUE 1 +#endif +#ifndef FALSE +#define FALSE 0 +#endif INLINE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) { asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));