diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt --- a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt +++ b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt @@ -100,14 +100,18 @@ macro(add_cuda_bc_library) set(cu_cmd ${AOMP_BINDIR}/clang++ -std=c++14 - -fcuda-rdc + -xc++ + -c -fvisibility=default --cuda-device-only + -fopenmp-cuda-mode # skip data sharing prologue on device functions -Wno-unused-value - -x hip - -nogpulib -nogpuinc + -nogpulib + -D__AMDGCN__ # Code uses this to distinguish vs nvptx + -target x86_64-pc-linux-gnu -fopenmp + -fopenmp-targets=amdgcn-amd-amdhsa + -Xopenmp-target=amdgcn-amd-amdhsa -march=${mcpu} -O${optimization_level} - --cuda-gpu-arch=${mcpu} ${CUDA_DEBUG} -I${CMAKE_CURRENT_SOURCE_DIR}/src -I${devicertl_base_directory}) diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_intrinsics.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_intrinsics.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_intrinsics.h @@ -0,0 +1,48 @@ +//===--- amdgcn_intrinsics.h - Intrinsics used by deviceRTL ---------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _AMDGCN_INTRINSICS_H_ +#define _AMDGCN_INTRINSICS_H_ + +#ifndef EXTERN +#error "Expected definition of EXTERN" +#endif + +#include + +#ifdef _OPENMP +// Openmp doesn't pull these builtins into scope, but does error if the type is +// incorrect +// This may be a quirk of openmp's compile for host + device assumption, where +// these don't resolve to anything on the host + +EXTERN uint32_t __builtin_amdgcn_atomic_inc32(volatile uint32_t *, uint32_t, + uint32_t, const char *); +EXTERN void __builtin_amdgcn_s_barrier(void); +EXTERN void __builtin_amdgcn_fence(uint32_t, const char *); + +EXTERN void __builtin_amdgcn_s_sleep(int); + +EXTERN uint32_t __builtin_amdgcn_workitem_id_x(void); +EXTERN uint32_t __builtin_amdgcn_workgroup_id_x(void); +EXTERN uint16_t __builtin_amdgcn_workgroup_size_x(void); +EXTERN uint32_t __builtin_amdgcn_grid_size_x(void); + +EXTERN uint64_t __builtin_amdgcn_s_memrealtime(void); +EXTERN uint32_t __builtin_amdgcn_s_getreg(int32_t); +EXTERN uint64_t __builtin_amdgcn_read_exec(void); + +EXTERN __attribute__((address_space(4))) void * +__builtin_amdgcn_dispatch_ptr() noexcept; + +EXTERN uint32_t __builtin_amdgcn_mbcnt_lo(uint32_t, uint32_t); +EXTERN uint32_t __builtin_amdgcn_mbcnt_hi(uint32_t, uint32_t); +EXTERN int32_t __builtin_amdgcn_ds_bpermute(int32_t, int32_t); +#endif + +#endif diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip @@ -14,6 +14,7 @@ // a SIMD => wavefront mapping once that is implemented. // //===----------------------------------------------------------------------===// +#pragma omp declare target #include "common/debug.h" @@ -26,3 +27,5 @@ DEVICE void __kmpc_impl_set_lock(omp_lock_t *) { warn(); } DEVICE void __kmpc_impl_unset_lock(omp_lock_t *) { warn(); } DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock) { warn(); } + +#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip @@ -5,6 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +#pragma omp declare target #include "target_impl.h" @@ -59,3 +60,5 @@ ENCODE_HWREG(HW_ID_SE_ID_SIZE, HW_ID_SE_ID_OFFSET, HW_ID)); return (se_id << HW_ID_CU_ID_SIZE) + cu_id; } + +#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h @@ -17,16 +17,54 @@ #endif #include "amdgcn_interface.h" +#include "amdgcn_intrinsics.h" #include #include #include #include +#ifdef _OPENMP +#define DEVICE +#else #define DEVICE __attribute__((device)) +#endif + #define INLINE inline DEVICE #define NOINLINE __attribute__((noinline)) DEVICE -#define SHARED __attribute__((shared)) + +#ifdef _OPENMP + +// Follows the pattern in interface.h +typedef enum omp_allocator_handle_t { + omp_null_allocator = 0, + omp_default_mem_alloc = 1, + omp_large_cap_mem_alloc = 2, + omp_const_mem_alloc = 3, + omp_high_bw_mem_alloc = 4, + omp_low_lat_mem_alloc = 5, + omp_cgroup_mem_alloc = 6, + omp_pteam_mem_alloc = 7, + omp_thread_mem_alloc = 8, + KMP_ALLOCATOR_MAX_HANDLE = UINTPTR_MAX +} omp_allocator_handle_t; + +#define __p(STR) _Pragma(STR) +#define __p2(STR) __p(#STR) + +#define SHARED(NAME) \ + NAME [[clang::loader_uninitialized]]; \ + __p2(omp allocate(NAME) allocator(omp_pteam_mem_alloc)) + +#define EXTERN_SHARED(NAME) \ + NAME; \ + __p2(omp allocate(NAME) allocator(omp_pteam_mem_alloc)) + +#else // HIP +#define SHARED(NAME) __attribute__((shared)) NAME +#define EXTERN_SHARED(NAME) __attribute__((shared)) NAME +#endif + #define ALIGN(N) __attribute__((aligned(N))) #include "hip_atomics.h" diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip @@ -9,6 +9,7 @@ // Definitions of target specific functions // //===----------------------------------------------------------------------===// +#pragma omp declare target #include "target_impl.h" @@ -65,7 +66,13 @@ return __builtin_amdgcn_ds_bpermute(index << 2, var); } -static DEVICE SHARED uint32_t L1_Barrier; + +// static doesn't work for openmp + shared, the variable is discarded by llc +// and lld then fails to link. Unclear why the variable hasn't been associated +// with the kernel. Dropping the static qualifier for now. + +// static +DEVICE uint32_t SHARED(L1_Barrier); DEVICE void __kmpc_impl_target_init() { // Don't have global ctors, and shared memory is not zero init @@ -147,3 +154,5 @@ // Stub implementations DEVICE void *__kmpc_impl_malloc(size_t ) { return nullptr } DEVICE void __kmpc_impl_free(void *) {} + +#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/omptarget.h b/openmp/libomptarget/deviceRTLs/common/omptarget.h --- a/openmp/libomptarget/deviceRTLs/common/omptarget.h +++ b/openmp/libomptarget/deviceRTLs/common/omptarget.h @@ -71,8 +71,8 @@ uint32_t nArgs; }; -extern DEVICE SHARED omptarget_nvptx_SharedArgs - omptarget_nvptx_globalArgs; +extern DEVICE + omptarget_nvptx_SharedArgs EXTERN_SHARED(omptarget_nvptx_globalArgs); // Worker slot type which is initialized with the default worker slot // size of 4*32 bytes. @@ -94,7 +94,7 @@ __kmpc_impl_lanemask_t ActiveThreads[DS_Max_Warp_Number]; }; -extern DEVICE SHARED DataSharingStateTy DataSharingState; +extern DEVICE DataSharingStateTy EXTERN_SHARED(DataSharingState); //////////////////////////////////////////////////////////////////////////////// // task ICV and (implicit & explicit) task state @@ -294,18 +294,25 @@ extern DEVICE omptarget_nvptx_SimpleMemoryManager omptarget_nvptx_simpleMemoryManager; -extern DEVICE SHARED uint32_t usedMemIdx; -extern DEVICE SHARED uint32_t usedSlotIdx; -extern DEVICE SHARED uint8_t - parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; -extern DEVICE SHARED uint16_t threadLimit; -extern DEVICE SHARED uint16_t threadsInTeam; -extern DEVICE SHARED uint16_t nThreads; -extern DEVICE SHARED - omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; - -extern DEVICE SHARED uint32_t execution_param; -extern DEVICE SHARED void *ReductionScratchpadPtr; +extern DEVICE uint32_t EXTERN_SHARED(usedMemIdx); +extern DEVICE uint32_t EXTERN_SHARED(usedSlotIdx); + +#if _OPENMP +extern DEVICE uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; +#pragma omp allocate(parallelLevel) allocator(omp_pteam_mem_alloc) +#else +extern DEVICE + uint8_t EXTERN_SHARED(parallelLevel)[MAX_THREADS_PER_TEAM / WARPSIZE]; +#endif + +extern DEVICE uint16_t EXTERN_SHARED(threadLimit); +extern DEVICE uint16_t EXTERN_SHARED(threadsInTeam); +extern DEVICE uint16_t EXTERN_SHARED(nThreads); +extern DEVICE omptarget_nvptx_ThreadPrivateContext * + EXTERN_SHARED(omptarget_nvptx_threadPrivateContext); + +extern DEVICE uint32_t EXTERN_SHARED(execution_param); +extern DEVICE void *EXTERN_SHARED(ReductionScratchpadPtr); //////////////////////////////////////////////////////////////////////////////// // work function (outlined parallel/simd functions) and arguments. @@ -313,8 +320,9 @@ //////////////////////////////////////////////////////////////////////////////// typedef void *omptarget_nvptx_WorkFn; -extern volatile DEVICE SHARED omptarget_nvptx_WorkFn - omptarget_nvptx_workFn; + +extern volatile DEVICE + omptarget_nvptx_WorkFn EXTERN_SHARED(omptarget_nvptx_workFn); //////////////////////////////////////////////////////////////////////////////// // get private data structures diff --git a/openmp/libomptarget/deviceRTLs/common/src/cancel.cu b/openmp/libomptarget/deviceRTLs/common/src/cancel.cu --- a/openmp/libomptarget/deviceRTLs/common/src/cancel.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/cancel.cu @@ -9,6 +9,7 @@ // Interface to be used in the implementation of OpenMP cancel. // //===----------------------------------------------------------------------===// +#pragma omp declare target #include "interface.h" #include "common/debug.h" @@ -26,3 +27,5 @@ // disabled return 0; } + +#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/critical.cu b/openmp/libomptarget/deviceRTLs/common/src/critical.cu --- a/openmp/libomptarget/deviceRTLs/common/src/critical.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/critical.cu @@ -9,6 +9,7 @@ // This file contains the implementation of critical with KMPC interface // //===----------------------------------------------------------------------===// +#pragma omp declare target #include "interface.h" #include "common/debug.h" @@ -26,3 +27,5 @@ PRINT0(LD_IO, "call to kmpc_end_critical()\n"); omp_unset_lock((omp_lock_t *)lck); } + +#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu --- a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu @@ -9,6 +9,8 @@ // This file contains the implementation of data sharing environments // //===----------------------------------------------------------------------===// +#pragma omp declare target + #include "common/omptarget.h" #include "target_impl.h" @@ -275,3 +277,4 @@ omptarget_nvptx_simpleMemoryManager.Release(); } +#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu --- a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu @@ -10,6 +10,7 @@ // invoked by the user in an OpenMP region // //===----------------------------------------------------------------------===// +#pragma omp declare target #include "common/omptarget.h" #include "common/target_atomic.h" @@ -316,11 +317,21 @@ return rc; } +// For some reason this function, and only this function, triggers +// error: definition of builtin function 'omp_is_initial_device' +// Working around here until the compiler quirk is understood +#ifdef __AMDGCN__ +DEVICE int omp_is_initial_device_OVERLOAD(void) asm("omp_is_initial_device"); +DEVICE int omp_is_initial_device_OVERLOAD(void) { + PRINT0(LD_IO, "call omp_is_initial_device() returns 0\n"); + return 0; // 0 by def on device +} +#else EXTERN int omp_is_initial_device(void) { PRINT0(LD_IO, "call omp_is_initial_device() returns 0\n"); return 0; // 0 by def on device } - +#endif // Unspecified on the device. EXTERN int omp_get_initial_device(void) { PRINT0(LD_IO, "call omp_get_initial_device() returns 0\n"); @@ -362,3 +373,5 @@ PRINT(LD_IO, "call omp_test_lock() return %d\n", rc); return rc; } + +#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/loop.cu b/openmp/libomptarget/deviceRTLs/common/src/loop.cu --- a/openmp/libomptarget/deviceRTLs/common/src/loop.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/loop.cu @@ -11,6 +11,7 @@ // interface as loops. // //===----------------------------------------------------------------------===// +#pragma omp declare target #include "common/omptarget.h" #include "target_impl.h" @@ -754,3 +755,5 @@ EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid) { PRINT0(LD_IO, "call kmpc_for_static_fini\n"); } + +#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu b/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu --- a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu @@ -9,6 +9,7 @@ // This file contains the data objects used on the GPU device. // //===----------------------------------------------------------------------===// +#pragma omp declare target #include "common/omptarget.h" #include "common/device_environment.h" @@ -29,40 +30,49 @@ DEVICE omptarget_nvptx_SimpleMemoryManager omptarget_nvptx_simpleMemoryManager; -DEVICE SHARED uint32_t usedMemIdx; -DEVICE SHARED uint32_t usedSlotIdx; +DEVICE uint32_t SHARED(usedMemIdx); +DEVICE uint32_t SHARED(usedSlotIdx); -DEVICE SHARED uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; -DEVICE SHARED uint16_t threadLimit; -DEVICE SHARED uint16_t threadsInTeam; -DEVICE SHARED uint16_t nThreads; +#ifdef _OPENMP +DEVICE [[clang::loader_uninitialized]] uint8_t + parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; +#pragma omp allocate(parallelLevel) allocator(omp_pteam_mem_alloc) +#else +DEVICE uint8_t SHARED(parallelLevel)[MAX_THREADS_PER_TEAM / WARPSIZE]; +#endif + +DEVICE uint16_t SHARED(threadLimit); +DEVICE uint16_t SHARED(threadsInTeam); +DEVICE uint16_t SHARED(nThreads); // Pointer to this team's OpenMP state object -DEVICE SHARED - omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; +DEVICE omptarget_nvptx_ThreadPrivateContext * + SHARED(omptarget_nvptx_threadPrivateContext); //////////////////////////////////////////////////////////////////////////////// // The team master sets the outlined parallel function in this variable to // communicate with the workers. Since it is in shared memory, there is one // copy of these variables for each kernel, instance, and team. //////////////////////////////////////////////////////////////////////////////// -volatile DEVICE SHARED omptarget_nvptx_WorkFn omptarget_nvptx_workFn; +volatile DEVICE omptarget_nvptx_WorkFn SHARED(omptarget_nvptx_workFn); //////////////////////////////////////////////////////////////////////////////// // OpenMP kernel execution parameters //////////////////////////////////////////////////////////////////////////////// -DEVICE SHARED uint32_t execution_param; +DEVICE uint32_t SHARED(execution_param); //////////////////////////////////////////////////////////////////////////////// // Data sharing state //////////////////////////////////////////////////////////////////////////////// -DEVICE SHARED DataSharingStateTy DataSharingState; +DEVICE DataSharingStateTy SHARED(DataSharingState); //////////////////////////////////////////////////////////////////////////////// // Scratchpad for teams reduction. //////////////////////////////////////////////////////////////////////////////// -DEVICE SHARED void *ReductionScratchpadPtr; +DEVICE void *SHARED(ReductionScratchpadPtr); //////////////////////////////////////////////////////////////////////////////// // Data sharing related variables. //////////////////////////////////////////////////////////////////////////////// -DEVICE SHARED omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs; +DEVICE omptarget_nvptx_SharedArgs SHARED(omptarget_nvptx_globalArgs); + +#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu --- a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu @@ -9,6 +9,7 @@ // This file contains the initialization code for the GPU // //===----------------------------------------------------------------------===// +#pragma omp declare target #include "common/omptarget.h" #include "target_impl.h" @@ -157,3 +158,5 @@ PRINT0(LD_IO | LD_PAR, "call to __kmpc_is_spmd_exec_mode\n"); return isSPMDMode(); } + +#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu --- a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu @@ -31,6 +31,7 @@ // To make a long story short... // //===----------------------------------------------------------------------===// +#pragma omp declare target #include "common/omptarget.h" #include "target_impl.h" @@ -300,3 +301,5 @@ int proc_bind) { PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", (int)proc_bind); } + +#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu --- a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu @@ -9,6 +9,7 @@ // This file contains the implementation of reduction with KMPC interface. // //===----------------------------------------------------------------------===// +#pragma omp declare target #include "common/omptarget.h" #include "common/target_atomic.h" @@ -206,8 +207,8 @@ : /*Master thread only*/ 1; uint32_t TeamId = GetBlockIdInKernel(); uint32_t NumTeams = GetNumberOfBlocksInKernel(); - static SHARED unsigned Bound; - static SHARED unsigned ChunkTeamCount; + static unsigned SHARED(Bound); + static unsigned SHARED(ChunkTeamCount); // Block progress for teams greater than the current upper // limit. We always only allow a number of teams less or equal @@ -312,3 +313,4 @@ return 0; } +#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/support.cu b/openmp/libomptarget/deviceRTLs/common/src/support.cu --- a/openmp/libomptarget/deviceRTLs/common/src/support.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/support.cu @@ -9,6 +9,7 @@ // Wrapper implementation to some functions natively supported by the GPU. // //===----------------------------------------------------------------------===// +#pragma omp declare target #include "common/support.h" #include "common/debug.h" @@ -264,3 +265,4 @@ return static_cast(ReductionScratchpadPtr) + 256; } +#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/sync.cu b/openmp/libomptarget/deviceRTLs/common/src/sync.cu --- a/openmp/libomptarget/deviceRTLs/common/src/sync.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/sync.cu @@ -9,6 +9,7 @@ // Include all synchronization. // //===----------------------------------------------------------------------===// +#pragma omp declare target #include "common/omptarget.h" #include "target_impl.h" @@ -135,3 +136,5 @@ PRINT0(LD_IO, "call __kmpc_syncwarp\n"); __kmpc_impl_syncwarp(Mask); } + +#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/task.cu b/openmp/libomptarget/deviceRTLs/common/src/task.cu --- a/openmp/libomptarget/deviceRTLs/common/src/task.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/task.cu @@ -26,6 +26,7 @@ // - end // //===----------------------------------------------------------------------===// +#pragma omp declare target #include "common/omptarget.h" @@ -214,3 +215,5 @@ __kmpc_omp_task_with_deps(loc, global_tid, newKmpTaskDescr, 0, 0, 0, 0); } + +#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -23,7 +23,10 @@ #define DEVICE __device__ #define INLINE __forceinline__ DEVICE #define NOINLINE __noinline__ DEVICE -#define SHARED __shared__ + +#define SHARED(NAME) __shared__ NAME +#define EXTERN_SHARED(NAME) __shared__ NAME + #define ALIGN(N) __align__(N) ////////////////////////////////////////////////////////////////////////////////