diff --git a/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake b/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake --- a/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake +++ b/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake @@ -15,7 +15,7 @@ if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER STREQUAL "") set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${LIBOMPTARGET_NVPTX_CUDA_COMPILER}) elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang") - set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${CMAKE_C_COMPILER}) + set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${CMAKE_CXX_COMPILER}) else() return() endif() diff --git a/openmp/libomptarget/deviceRTLs/common/debug.h b/openmp/libomptarget/deviceRTLs/common/debug.h --- a/openmp/libomptarget/deviceRTLs/common/debug.h +++ b/openmp/libomptarget/deviceRTLs/common/debug.h @@ -129,15 +129,18 @@ #if OMPTARGET_NVPTX_DEBUG || OMPTARGET_NVPTX_TEST || OMPTARGET_NVPTX_WARNING #include "common/support.h" +#pragma omp declare target template NOINLINE static void log(const char *fmt, Arguments... parameters) { printf(fmt, (int)GetBlockIdInKernel(), (int)GetThreadIdInBlock(), (int)GetWarpId(), (int)GetLaneId(), parameters...); } +#pragma omp end declare target #endif #if OMPTARGET_NVPTX_TEST +#pragma omp declare target template NOINLINE static void check(bool cond, const char *fmt, Arguments... parameters) { @@ -148,6 +151,7 @@ } NOINLINE static void check(bool cond) { assert(cond); } +#pragma omp end declare target #endif // set flags that are tested (inclusion properties) diff --git a/openmp/libomptarget/deviceRTLs/common/device_environment.h b/openmp/libomptarget/deviceRTLs/common/device_environment.h --- a/openmp/libomptarget/deviceRTLs/common/device_environment.h +++ b/openmp/libomptarget/deviceRTLs/common/device_environment.h @@ -15,10 +15,14 @@ #include "target_impl.h" +#pragma omp declare target + struct omptarget_device_environmentTy { int32_t debug_level; }; extern DEVICE omptarget_device_environmentTy omptarget_device_environment; +#pragma omp end declare target + #endif diff --git a/openmp/libomptarget/deviceRTLs/common/omptarget.h b/openmp/libomptarget/deviceRTLs/common/omptarget.h --- a/openmp/libomptarget/deviceRTLs/common/omptarget.h +++ b/openmp/libomptarget/deviceRTLs/common/omptarget.h @@ -34,6 +34,8 @@ #define BARRIER_COUNTER 0 #define ORDERED_COUNTER 1 +#pragma omp declare target + // arguments needed for L0 parallelism only. class omptarget_nvptx_SharedArgs { public: @@ -273,9 +275,9 @@ /// Memory manager for statically allocated memory. class omptarget_nvptx_SimpleMemoryManager { private: - ALIGN(128) struct MemDataTy { + struct MemDataTy { volatile unsigned keys[OMP_STATE_COUNT]; - } MemData[MAX_SM]; + } MemData[MAX_SM] ALIGN(128); INLINE static uint32_t hash(unsigned key) { return key & (OMP_STATE_COUNT - 1); @@ -326,6 +328,8 @@ getMyTopTaskDescriptor(bool isSPMDExecutionMode); INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId); +#pragma omp end declare target + //////////////////////////////////////////////////////////////////////////////// // inlined implementation //////////////////////////////////////////////////////////////////////////////// diff --git a/openmp/libomptarget/deviceRTLs/common/omptargeti.h b/openmp/libomptarget/deviceRTLs/common/omptargeti.h --- a/openmp/libomptarget/deviceRTLs/common/omptargeti.h +++ b/openmp/libomptarget/deviceRTLs/common/omptargeti.h @@ -13,6 +13,8 @@ #include "common/target_atomic.h" +#pragma omp declare target + //////////////////////////////////////////////////////////////////////////////// // Task Descriptor //////////////////////////////////////////////////////////////////////////////// @@ -226,3 +228,5 @@ usedMemIdx = i; return static_cast(buf) + (sm * OMP_STATE_COUNT + i) * size; } + +#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/cancel.cu b/openmp/libomptarget/deviceRTLs/common/src/cancel.cu --- a/openmp/libomptarget/deviceRTLs/common/src/cancel.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/cancel.cu @@ -13,6 +13,8 @@ #include "interface.h" #include "common/debug.h" +#pragma omp declare target + EXTERN int32_t __kmpc_cancellationpoint(kmp_Ident *loc, int32_t global_tid, int32_t cancelVal) { PRINT(LD_IO, "call kmpc_cancellationpoint(cancel val %d)\n", (int)cancelVal); @@ -26,3 +28,5 @@ // disabled return 0; } + +#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/critical.cu b/openmp/libomptarget/deviceRTLs/common/src/critical.cu --- a/openmp/libomptarget/deviceRTLs/common/src/critical.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/critical.cu @@ -13,6 +13,8 @@ #include "interface.h" #include "common/debug.h" +#pragma omp declare target + EXTERN void __kmpc_critical(kmp_Ident *loc, int32_t global_tid, kmp_CriticalName *lck) { @@ -26,3 +28,5 @@ PRINT0(LD_IO, "call to kmpc_end_critical()\n"); omp_unset_lock((omp_lock_t *)lck); } + +#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu --- a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu @@ -12,6 +12,8 @@ #include "common/omptarget.h" #include "target_impl.h" +#pragma omp declare target + // Return true if this is the master thread. INLINE static bool IsMasterThread(bool isSPMDExecutionMode) { return !isSPMDExecutionMode && GetMasterThreadID() == GetThreadIdInBlock(); @@ -275,3 +277,4 @@ omptarget_nvptx_simpleMemoryManager.Release(); } +#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu --- a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu @@ -15,6 +15,8 @@ #include "common/target_atomic.h" #include "target_impl.h" +#pragma omp declare target + EXTERN double omp_get_wtick(void) { double rc = __kmpc_impl_get_wtick(); PRINT(LD_IO, "omp_get_wtick() returns %g\n", rc); @@ -316,10 +318,10 @@ return rc; } -EXTERN int omp_is_initial_device(void) { - PRINT0(LD_IO, "call omp_is_initial_device() returns 0\n"); - return 0; // 0 by def on device -} +// EXTERN int omp_is_initial_device(void) { +// PRINT0(LD_IO, "call omp_is_initial_device() returns 0\n"); +// return 0; // 0 by def on device +// } // Unspecified on the device. EXTERN int omp_get_initial_device(void) { @@ -362,3 +364,5 @@ PRINT(LD_IO, "call omp_test_lock() return %d\n", rc); return rc; } + +#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/loop.cu b/openmp/libomptarget/deviceRTLs/common/src/loop.cu --- a/openmp/libomptarget/deviceRTLs/common/src/loop.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/loop.cu @@ -16,6 +16,8 @@ #include "target_impl.h" #include "common/target_atomic.h" +#pragma omp declare target + //////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// // template class that encapsulate all the helper functions @@ -754,3 +756,5 @@ EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid) { PRINT0(LD_IO, "call kmpc_for_static_fini\n"); } + +#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu b/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu --- a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu @@ -13,6 +13,8 @@ #include "common/omptarget.h" #include "common/device_environment.h" +#pragma omp declare target + //////////////////////////////////////////////////////////////////////////////// // global device environment //////////////////////////////////////////////////////////////////////////////// @@ -66,3 +68,5 @@ // Data sharing related variables. //////////////////////////////////////////////////////////////////////////////// DEVICE SHARED omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs; + +#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu --- a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu @@ -13,6 +13,8 @@ #include "common/omptarget.h" #include "target_impl.h" +#pragma omp declare target + //////////////////////////////////////////////////////////////////////////////// // global data tables //////////////////////////////////////////////////////////////////////////////// @@ -157,3 +159,5 @@ PRINT0(LD_IO | LD_PAR, "call to __kmpc_is_spmd_exec_mode\n"); return isSPMDMode(); } + +#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu --- a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu @@ -35,6 +35,8 @@ #include "common/omptarget.h" #include "target_impl.h" +#pragma omp declare target + //////////////////////////////////////////////////////////////////////////////// // support for parallel that goes parallel (1 static level only) //////////////////////////////////////////////////////////////////////////////// @@ -300,3 +302,5 @@ int proc_bind) { PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", (int)proc_bind); } + +#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu --- a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu @@ -14,6 +14,8 @@ #include "common/target_atomic.h" #include "target_impl.h" +#pragma omp declare target + EXTERN void __kmpc_nvptx_end_reduce(int32_t global_tid) {} @@ -312,3 +314,4 @@ return 0; } +#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/support.cu b/openmp/libomptarget/deviceRTLs/common/src/support.cu --- a/openmp/libomptarget/deviceRTLs/common/src/support.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/support.cu @@ -14,6 +14,8 @@ #include "common/debug.h" #include "common/omptarget.h" +#pragma omp declare target + //////////////////////////////////////////////////////////////////////////////// // Execution Parameters //////////////////////////////////////////////////////////////////////////////// @@ -264,3 +266,4 @@ return static_cast(ReductionScratchpadPtr) + 256; } +#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/sync.cu b/openmp/libomptarget/deviceRTLs/common/src/sync.cu --- a/openmp/libomptarget/deviceRTLs/common/src/sync.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/sync.cu @@ -13,6 +13,8 @@ #include "common/omptarget.h" #include "target_impl.h" +#pragma omp declare target + //////////////////////////////////////////////////////////////////////////////// // KMP Ordered calls //////////////////////////////////////////////////////////////////////////////// @@ -135,3 +137,5 @@ PRINT0(LD_IO, "call __kmpc_syncwarp\n"); __kmpc_impl_syncwarp(Mask); } + +#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/task.cu b/openmp/libomptarget/deviceRTLs/common/src/task.cu --- a/openmp/libomptarget/deviceRTLs/common/src/task.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/task.cu @@ -29,6 +29,8 @@ #include "common/omptarget.h" +#pragma omp declare target + EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc( kmp_Ident *loc, // unused uint32_t global_tid, // unused @@ -214,3 +216,5 @@ __kmpc_omp_task_with_deps(loc, global_tid, newKmpTaskDescr, 0, 0, 0, 0); } + +#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/state-queue.h b/openmp/libomptarget/deviceRTLs/common/state-queue.h --- a/openmp/libomptarget/deviceRTLs/common/state-queue.h +++ b/openmp/libomptarget/deviceRTLs/common/state-queue.h @@ -23,6 +23,8 @@ #include "target_impl.h" +#pragma omp declare target + template class omptarget_nvptx_Queue { private: ElementType elements[SIZE]; @@ -46,6 +48,8 @@ INLINE ElementType *Dequeue(); }; +#pragma omp end declare target + #include "state-queuei.h" #endif diff --git a/openmp/libomptarget/deviceRTLs/common/state-queuei.h b/openmp/libomptarget/deviceRTLs/common/state-queuei.h --- a/openmp/libomptarget/deviceRTLs/common/state-queuei.h +++ b/openmp/libomptarget/deviceRTLs/common/state-queuei.h @@ -19,6 +19,8 @@ #include "state-queue.h" #include "common/target_atomic.h" +#pragma omp declare target + template INLINE uint32_t omptarget_nvptx_Queue::ENQUEUE_TICKET() { return __kmpc_atomic_add((unsigned int *)&tail, 1u); @@ -88,3 +90,5 @@ DoneServing(slot, id); return element; } + +#pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/support.h b/openmp/libomptarget/deviceRTLs/common/support.h --- a/openmp/libomptarget/deviceRTLs/common/support.h +++ b/openmp/libomptarget/deviceRTLs/common/support.h @@ -16,6 +16,8 @@ #include "interface.h" #include "target_impl.h" +#pragma omp declare target + //////////////////////////////////////////////////////////////////////////////// // Execution Parameters //////////////////////////////////////////////////////////////////////////////// @@ -95,4 +97,6 @@ DEVICE unsigned int *GetTeamsReductionTimestamp(); DEVICE char *GetTeamsReductionScratchpad(); +#pragma omp end declare target + #endif diff --git a/openmp/libomptarget/deviceRTLs/common/target_atomic.h b/openmp/libomptarget/deviceRTLs/common/target_atomic.h --- a/openmp/libomptarget/deviceRTLs/common/target_atomic.h +++ b/openmp/libomptarget/deviceRTLs/common/target_atomic.h @@ -15,6 +15,16 @@ #include "target_impl.h" +#pragma omp declare target + +// We declared these template functions here. Their definition is in +// __clang_cuda_device_functions.h. +template T atomicAdd(T *, T); +template T atomicInc(T *, T); +template T atomicMax(T *, T); +template T atomicExch(T *, T); +template T atomicCAS(T *, T, T); + template INLINE T __kmpc_atomic_add(T *address, T val) { return atomicAdd(address, val); } @@ -35,4 +45,6 @@ return atomicCAS(address, compare, val); } +#pragma omp end declare target + #endif diff --git a/openmp/libomptarget/deviceRTLs/interface.h b/openmp/libomptarget/deviceRTLs/interface.h --- a/openmp/libomptarget/deviceRTLs/interface.h +++ b/openmp/libomptarget/deviceRTLs/interface.h @@ -26,6 +26,8 @@ #include "nvptx/src/nvptx_interface.h" #endif +#pragma omp declare target + //////////////////////////////////////////////////////////////////////////////// // OpenMP interface //////////////////////////////////////////////////////////////////////////////// @@ -448,4 +450,6 @@ EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode, int16_t is_shared); +#pragma omp end declare target + #endif diff --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt --- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt +++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt @@ -10,6 +10,19 @@ # ##===----------------------------------------------------------------------===## +# TODO: This part needs to be refined when libomptarget is going to support +# Windows! +if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64") + set(aux_triple x86_64-unknown-linux-gnu) +elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "ppc64le") + set(aux_triple powerpc64le-unknown-linux-gnu) +elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64") + set(aux_triple aarch64-unknown-linux-gnu) +else() + libomptarget_say("Not building CUDA offloading device RTL: unknow host arch: ${CMAKE_HOST_SYSTEM_PROCESSOR}") + return() +endif() + get_filename_component(devicertl_base_directory ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) @@ -79,61 +92,81 @@ ) # Set flags for LLVM Bitcode compilation. - set(bc_flags ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS} + set(bc_flags -S -x c++ + -target nvptx64 + -Xclang -emit-llvm-bc + -Xclang -aux-triple -Xclang ${aux_triple} + -fopenmp -Xclang -fopenmp-is-device + -D__CUDACC__ -I${devicertl_base_directory} - -I${devicertl_nvptx_directory}/src) + -I${devicertl_nvptx_directory}/src + -I${CUDA_TOOLKIT_ROOT_DIR}/include + -fdeclspec + -include openmp_wrappers/__clang_openmp_device_functions.h) if(${LIBOMPTARGET_NVPTX_DEBUG}) - set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=-1) + list(APPEND bc_flags -DOMPTARGET_NVPTX_DEBUG=-1) else() - set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=0) + list(APPEND bc_flags -DOMPTARGET_NVPTX_DEBUG=0) endif() # Create target to build all Bitcode libraries. add_custom_target(omptarget-nvptx-bc) - # Generate a Bitcode library for all the compute capabilities the user requested. + # That's all PTX versions we know for now + set(nvptx_ptx_list 50 60 61 62 63 64 65 70 71) + + # Generate a Bitcode library for all the compute capabilities the user + # requested and all PTX version we know for now. foreach(sm ${nvptx_sm_list}) - set(cuda_arch --cuda-gpu-arch=sm_${sm}) - - # Compile CUDA files to bitcode. - set(bc_files "") - foreach(src ${cuda_src_files}) - get_filename_component(infile ${src} ABSOLUTE) - get_filename_component(outfile ${src} NAME) - - add_custom_command(OUTPUT ${outfile}-sm_${sm}.bc - COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags} ${cuda_arch} ${MAX_SM_DEFINITION} - -c ${infile} -o ${outfile}-sm_${sm}.bc - DEPENDS ${infile} - IMPLICIT_DEPENDS CXX ${infile} - COMMENT "Building LLVM bitcode ${outfile}-sm_${sm}.bc" - VERBATIM + set(cuda_arch -Xclang -target-cpu -Xclang sm_${sm}) + + foreach(ptx_num ${nvptx_ptx_list}) + set(ptx_version -Xclang -target-feature -Xclang +ptx${ptx_num}) + + set(bc_files "") + foreach(src ${cuda_src_files}) + get_filename_component(infile ${src} ABSOLUTE) + get_filename_component(outfile ${src} NAME) + set(outfile "${outfile}-sm_${sm}-ptx${ptx_num}.bc") + + add_custom_command(OUTPUT ${outfile} + COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags} + ${cuda_arch} ${MAX_SM_DEFINITION} ${infile} -o ${outfile} + DEPENDS ${infile} + IMPLICIT_DEPENDS CXX ${infile} + COMMENT "Building LLVM bitcode ${outfile}" + VERBATIM + ) + set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}) + + list(APPEND bc_files ${outfile}) + endforeach() + + set(bclib_name "libomptarget-nvptx-sm_${sm}-ptx${ptx_num}.bc") + + # Link to a bitcode library. + add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} + COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER} + -o ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} ${bc_files} + DEPENDS ${bc_files} + COMMENT "Linking LLVM bitcode ${bclib_name}" ) - set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}-sm_${sm}.bc) + set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${bclib_name}) - list(APPEND bc_files ${outfile}-sm_${sm}.bc) - endforeach() + set(bclib_target_name "omptarget-nvptx-sm_${sm}-ptx${ptx_num}-bc") + + add_custom_target(${bclib_target_name} ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}) + add_dependencies(omptarget-nvptx-bc ${bclib_target_name}) - # Link to a bitcode library. - add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc - COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER} - -o ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc ${bc_files} - DEPENDS ${bc_files} - COMMENT "Linking LLVM bitcode libomptarget-nvptx-sm_${sm}.bc" - ) - set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES libomptarget-nvptx-sm_${sm}.bc) - - add_custom_target(omptarget-nvptx-${sm}-bc ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc) - add_dependencies(omptarget-nvptx-bc omptarget-nvptx-${sm}-bc) - - # Copy library to destination. - add_custom_command(TARGET omptarget-nvptx-${sm}-bc POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc - ${LIBOMPTARGET_LIBRARY_DIR}) - - # Install bitcode library under the lib destination folder. - install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc DESTINATION "${OPENMP_INSTALL_LIBDIR}") + # Copy library to destination. + add_custom_command(TARGET ${bclib_target_name} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} + ${LIBOMPTARGET_LIBRARY_DIR}) + + # Install bitcode library under the lib destination folder. + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} DESTINATION "${OPENMP_INSTALL_LIBDIR}") + endforeach() endforeach() endif() diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h b/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h @@ -11,8 +11,11 @@ #include -#define EXTERN extern "C" __device__ +#define EXTERN extern "C" + +#pragma omp declare target typedef uint32_t __kmpc_impl_lanemask_t; typedef uint32_t omp_lock_t; /* arbitrary type of the right length */ +#pragma omp end declare target #endif diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -20,11 +20,12 @@ #include "nvptx_interface.h" -#define DEVICE __device__ -#define INLINE __forceinline__ DEVICE -#define NOINLINE __noinline__ DEVICE -#define SHARED __shared__ -#define ALIGN(N) __align__(N) +#define DEVICE +#define INLINE inline __attribute__((always_inline)) +#define NOINLINE __attribute__((noinline)) +// FIXME: This attribute doesn't work here +#define SHARED __attribute__((shared)) +#define ALIGN(N) __attribute__((aligned(N))) //////////////////////////////////////////////////////////////////////////////// // Kernel options @@ -67,6 +68,17 @@ #define OMP_ACTIVE_PARALLEL_LEVEL 128 +#pragma omp declare target + +// FIXME: Forward declaration +extern "C" { +uint32_t __ffs(uint32_t); +uint32_t __popc(uint32_t); +// FIXME: This function is defined in but due to unknown reason we +// cannot use #include directly. +uint32_t min(uint32_t, uint32_t); +} + // Data sharing related quantities, need to match what is used in the compiler. enum DATA_SHARING_SIZES { // The maximum number of workers in a kernel. @@ -146,4 +158,6 @@ DEVICE void *__kmpc_impl_malloc(size_t); DEVICE void __kmpc_impl_free(void *); +#pragma omp end declare target + #endif diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu @@ -16,6 +16,22 @@ #include +#pragma omp declare target + +// FIXME: Forward declaration +extern "C" { +unsigned int __activemask(); +unsigned int __ballot(unsigned); +int __shfl_sync(unsigned mask, int val, int src_line); +int __shfl(int val, int src_line); +int __shfl_down(int var, unsigned detla, int width); +int __shfl_down_sync(unsigned mask, int var, unsigned detla, int width); +void __syncwarp(int mask); +void __threadfence(); +void __threadfence_block(); +void __threadfence_system(); +} + DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) { asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val)); } @@ -158,3 +174,5 @@ DEVICE void *__kmpc_impl_malloc(size_t x) { return malloc(x); } DEVICE void __kmpc_impl_free(void *x) { free(x); } + +#pragma omp end declare target