Index: libomptarget/deviceRTLs/nvptx/CMakeLists.txt =================================================================== --- libomptarget/deviceRTLs/nvptx/CMakeLists.txt +++ libomptarget/deviceRTLs/nvptx/CMakeLists.txt @@ -93,7 +93,7 @@ # Find a clang compiler capable of compiling cuda files to LLVM bitcode and # an LLVM linker. # We use the one provided by the user, attempt to use the one used to build - # libomptarget, attempt to use clang in the PATH, or just fail. + # libomptarget or just fail. set(LIBOMPTARGET_NVPTX_CUDA_COMPILER "" CACHE STRING "Location of a CUDA compiler capable of emitting LLVM bitcode.") @@ -105,8 +105,8 @@ elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang") set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${CMAKE_C_COMPILER}) else() - libomptarget_say("Cannot find a CUDA compiler capable of emitting LLVM bitcode.") - libomptarget_say("Please configure with flag -DLIBOMPTARGET_NVPTX_CUDA_COMPILER") + libomptarget_error_say("Cannot find a CUDA compiler capable of emitting LLVM bitcode.") + libomptarget_error_say("Please configure with flag -DLIBOMPTARGET_NVPTX_CUDA_COMPILER") endif() # Get compiler directory to try to locate a suitable linker @@ -118,8 +118,8 @@ # Use llvm-link from the directory containing clang set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER ${COMPILER_DIR}/llvm-link) else() - libomptarget_say("Cannot find a linker capable of linking LLVM bitcode objects.") - libomptarget_say("Please configure with flag -DLIBOMPTARGET_NVPTX_BC_LINKER") + libomptarget_error_say("Cannot find a linker capable of linking LLVM bitcode objects.") + libomptarget_error_say("Please configure with flag -DLIBOMPTARGET_NVPTX_BC_LINKER") endif() if(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER AND LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER) Index: libomptarget/deviceRTLs/nvptx/src/data_sharing.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/data_sharing.cu +++ libomptarget/deviceRTLs/nvptx/src/data_sharing.cu @@ -23,7 +23,7 @@ } // Warp ID in the CUDA block __device__ static unsigned getWarpId() { - return threadIdx.x >> DS_Max_Worker_Warp_Size_Bits; + return threadIdx.x / WARPSIZE; } // The CUDA thread ID of the master thread. @@ -300,7 +300,7 @@ // Get the frame used by the requested thread. - unsigned SourceWID = SourceThreadID >> DS_Max_Worker_Warp_Size_Bits; + unsigned SourceWID = SourceThreadID / WARPSIZE; DSPRINT(DSFLAG,"Source warp: %d\n", SourceWID); Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu @@ -42,12 +42,6 @@ return id; } -INLINE unsigned n_sm() { - unsigned n_sm; - asm("mov.u32 %0, %%nsmid;" : "=r"(n_sm)); - return n_sm; -} - EXTERN void __kmpc_kernel_init_params(void *Ptr) { PRINT(LD_IO, "call to __kmpc_kernel_init_params with version %f\n", OMPTARGET_NVPTX_VERSION); @@ -172,7 +166,7 @@ if (RequiresDataSharing && threadId % WARPSIZE == 0) { // Warp master innitializes data sharing environment. - unsigned WID = threadId >> DS_Max_Worker_Warp_Size_Bits; + unsigned WID = threadId / WARPSIZE; __kmpc_data_sharing_slot *RootS = currTeamDescr.RootS(WID); DataSharingState.SlotPtr[WID] = RootS; DataSharingState.StackPtr[WID] = (void*)&RootS->Data[0]; Index: libomptarget/deviceRTLs/nvptx/src/option.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/option.h +++ libomptarget/deviceRTLs/nvptx/src/option.h @@ -19,10 +19,8 @@ //////////////////////////////////////////////////////////////////////////////// // The following def must match the absolute limit hardwired in the host RTL -#define THREAD_ABSOLUTE_LIMIT 1024 /* omptx limit (must match threadAbsoluteLimit) */ - // max number of threads per team -#define MAX_THREADS_PER_TEAM THREAD_ABSOLUTE_LIMIT +#define MAX_THREADS_PER_TEAM 1024 #define WARPSIZE 32 Index: libomptarget/deviceRTLs/nvptx/src/reduction.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/reduction.cu +++ libomptarget/deviceRTLs/nvptx/src/reduction.cu @@ -234,8 +234,8 @@ // Get the OMP thread Id. This is different from BlockThreadId in the case of // an L2 parallel region. return GetOmpThreadId(BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized) == 0; -} #endif // __CUDA_ARCH__ >= 700 +} EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait( @@ -382,10 +382,10 @@ if (WarpId == 0) gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId); } +#endif // __CUDA_ARCH__ >= 700 return ThreadId == 0; } -#endif // __CUDA_ARCH__ >= 700 EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars,