diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h @@ -22,6 +22,8 @@ #define DEVICE __attribute__((device)) #define INLINE inline DEVICE #define NOINLINE __attribute__((noinline)) DEVICE +#define SHARED __attribute__((shared)) +#define ALIGN(N) __attribute__((aligned(N))) //////////////////////////////////////////////////////////////////////////////// // Kernel options diff --git a/openmp/libomptarget/deviceRTLs/common/omptarget.h b/openmp/libomptarget/deviceRTLs/common/omptarget.h --- a/openmp/libomptarget/deviceRTLs/common/omptarget.h +++ b/openmp/libomptarget/deviceRTLs/common/omptarget.h @@ -77,7 +77,7 @@ uint32_t nArgs; }; -extern __device__ __shared__ omptarget_nvptx_SharedArgs +extern DEVICE SHARED omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs; // Data structure to keep in shared memory that traces the current slot, stack, @@ -107,7 +107,7 @@ void *DataEnd; char Data[DS_Slot_Size]; }; -extern __device__ __shared__ DataSharingStateTy DataSharingState; +extern DEVICE SHARED DataSharingStateTy DataSharingState; //////////////////////////////////////////////////////////////////////////////// // task ICV and (implicit & explicit) task state @@ -259,9 +259,9 @@ workDescrForActiveParallel; // one, ONLY for the active par uint64_t lastprivateIterBuffer; - __align__(16) - __kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE]; - __align__(16) __kmpc_data_sharing_master_slot_static master_rootS[1]; + ALIGN(16) + __kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE]; + ALIGN(16) __kmpc_data_sharing_master_slot_static master_rootS[1]; }; //////////////////////////////////////////////////////////////////////////////// @@ -326,7 +326,7 @@ /// Memory manager for statically allocated memory. class omptarget_nvptx_SimpleMemoryManager { private: - __align__(128) struct MemDataTy { + ALIGN(128) struct MemDataTy { volatile unsigned keys[OMP_STATE_COUNT]; } MemData[MAX_SM]; @@ -345,20 +345,20 @@ // global data tables //////////////////////////////////////////////////////////////////////////////// -extern __device__ omptarget_nvptx_SimpleMemoryManager +extern DEVICE omptarget_nvptx_SimpleMemoryManager omptarget_nvptx_simpleMemoryManager; -extern __device__ __shared__ uint32_t usedMemIdx; -extern __device__ __shared__ uint32_t usedSlotIdx; -extern __device__ __shared__ uint8_t +extern DEVICE SHARED uint32_t usedMemIdx; +extern DEVICE SHARED uint32_t usedSlotIdx; +extern DEVICE SHARED uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; -extern __device__ __shared__ uint16_t threadLimit; -extern __device__ __shared__ uint16_t threadsInTeam; -extern __device__ __shared__ uint16_t nThreads; -extern __device__ __shared__ +extern DEVICE SHARED uint16_t threadLimit; +extern DEVICE SHARED uint16_t threadsInTeam; +extern DEVICE SHARED uint16_t nThreads; +extern DEVICE SHARED omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; -extern __device__ __shared__ uint32_t execution_param; -extern __device__ __shared__ void *ReductionScratchpadPtr; +extern DEVICE SHARED uint32_t execution_param; +extern DEVICE SHARED void *ReductionScratchpadPtr; //////////////////////////////////////////////////////////////////////////////// // work function (outlined parallel/simd functions) and arguments. @@ -366,7 +366,7 @@ //////////////////////////////////////////////////////////////////////////////// typedef void *omptarget_nvptx_WorkFn; -extern volatile __device__ __shared__ omptarget_nvptx_WorkFn +extern volatile DEVICE SHARED omptarget_nvptx_WorkFn omptarget_nvptx_workFn; //////////////////////////////////////////////////////////////////////////////// diff --git a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu --- a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu @@ -17,7 +17,7 @@ // global data tables //////////////////////////////////////////////////////////////////////////////// -extern __device__ +extern DEVICE omptarget_nvptx_Queue omptarget_nvptx_device_State[MAX_SM]; diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu @@ -97,7 +97,7 @@ DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_begin\n"); - // If the runtime has been elided, used __shared__ memory for master-worker + // If the runtime has been elided, used shared memory for master-worker // data sharing. if (!IsOMPRuntimeInitialized) return (void *)&DataSharingState; @@ -300,7 +300,7 @@ int16_t IsOMPRuntimeInitialized) { DSPRINT0(DSFLAG, "Entering __kmpc_get_data_sharing_environment_frame\n"); - // If the runtime has been elided, use __shared__ memory for master-worker + // If the runtime has been elided, use shared memory for master-worker // data sharing. We're reusing the statically allocated data structure // that is used for standard data sharing. if (!IsOMPRuntimeInitialized) diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu @@ -17,27 +17,27 @@ // global device environment //////////////////////////////////////////////////////////////////////////////// -__device__ omptarget_device_environmentTy omptarget_device_environment; +DEVICE omptarget_device_environmentTy omptarget_device_environment; //////////////////////////////////////////////////////////////////////////////// // global data holding OpenMP state information //////////////////////////////////////////////////////////////////////////////// -__device__ +DEVICE omptarget_nvptx_Queue omptarget_nvptx_device_State[MAX_SM]; -__device__ omptarget_nvptx_SimpleMemoryManager +DEVICE omptarget_nvptx_SimpleMemoryManager omptarget_nvptx_simpleMemoryManager; -__device__ __shared__ uint32_t usedMemIdx; -__device__ __shared__ uint32_t usedSlotIdx; +DEVICE SHARED uint32_t usedMemIdx; +DEVICE SHARED uint32_t usedSlotIdx; -__device__ __shared__ uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; -__device__ __shared__ uint16_t threadLimit; -__device__ __shared__ uint16_t threadsInTeam; -__device__ __shared__ uint16_t nThreads; +DEVICE SHARED uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; +DEVICE SHARED uint16_t threadLimit; +DEVICE SHARED uint16_t threadsInTeam; +DEVICE SHARED uint16_t nThreads; // Pointer to this team's OpenMP state object -__device__ __shared__ +DEVICE SHARED omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; //////////////////////////////////////////////////////////////////////////////// @@ -45,24 +45,24 @@ // communicate with the workers. Since it is in shared memory, there is one // copy of these variables for each kernel, instance, and team. //////////////////////////////////////////////////////////////////////////////// -volatile __device__ __shared__ omptarget_nvptx_WorkFn omptarget_nvptx_workFn; +volatile DEVICE SHARED omptarget_nvptx_WorkFn omptarget_nvptx_workFn; //////////////////////////////////////////////////////////////////////////////// // OpenMP kernel execution parameters //////////////////////////////////////////////////////////////////////////////// -__device__ __shared__ uint32_t execution_param; +DEVICE SHARED uint32_t execution_param; //////////////////////////////////////////////////////////////////////////////// // Data sharing state //////////////////////////////////////////////////////////////////////////////// -__device__ __shared__ DataSharingStateTy DataSharingState; +DEVICE SHARED DataSharingStateTy DataSharingState; //////////////////////////////////////////////////////////////////////////////// // Scratchpad for teams reduction. //////////////////////////////////////////////////////////////////////////////// -__device__ __shared__ void *ReductionScratchpadPtr; +DEVICE SHARED void *ReductionScratchpadPtr; //////////////////////////////////////////////////////////////////////////////// // Data sharing related variables. //////////////////////////////////////////////////////////////////////////////// -__device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs; +DEVICE SHARED omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs; diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu @@ -233,7 +233,7 @@ : /*Master thread only*/ 1; uint32_t TeamId = GetBlockIdInKernel(); uint32_t NumTeams = GetNumberOfBlocksInKernel(); - __shared__ volatile bool IsLastTeam; + SHARED volatile bool IsLastTeam; // Team masters of all teams write to the scratchpad. if (ThreadId == 0) { @@ -403,8 +403,8 @@ return (s & ~(unsigned)(WARPSIZE - 1)); } -__device__ static volatile uint32_t IterCnt = 0; -__device__ static volatile uint32_t Cnt = 0; +DEVICE static volatile uint32_t IterCnt = 0; +DEVICE static volatile uint32_t Cnt = 0; EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2( kmp_Ident *loc, int32_t global_tid, void *global_buffer, int32_t num_of_records, void *reduce_data, kmp_ShuffleReductFctPtr shflFct, @@ -426,8 +426,8 @@ : /*Master thread only*/ 1; uint32_t TeamId = GetBlockIdInKernel(); uint32_t NumTeams = GetNumberOfBlocksInKernel(); - __shared__ unsigned Bound; - __shared__ unsigned ChunkTeamCount; + SHARED unsigned Bound; + SHARED unsigned ChunkTeamCount; // Block progress for teams greater than the current upper // limit. We always only allow a number of teams less or equal diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -18,6 +18,8 @@ #define DEVICE __device__ #define INLINE __forceinline__ DEVICE #define NOINLINE __noinline__ DEVICE +#define SHARED __shared__ +#define ALIGN(N) __align__(N) //////////////////////////////////////////////////////////////////////////////// // Kernel options