diff --git a/openmp/CMakeLists.txt b/openmp/CMakeLists.txt --- a/openmp/CMakeLists.txt +++ b/openmp/CMakeLists.txt @@ -81,6 +81,7 @@ ${ENABLE_LIBOMPTARGET}) option(OPENMP_ENABLE_LIBOMPTARGET_PROFILING "Enable time profiling for libomptarget." ${ENABLE_LIBOMPTARGET}) +option(OPENMP_ENABLE_LIBOMP_PROFILING "Enable time profiling for libomp." OFF) # Build host runtime library, after LIBOMPTARGET variables are set since they are needed # to enable time profiling support in the OpenMP runtime. diff --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst --- a/openmp/docs/design/Runtimes.rst +++ b/openmp/docs/design/Runtimes.rst @@ -3,7 +3,7 @@ LLVM/OpenMP Runtimes ==================== -There are four distinct types of LLVM/OpenMP runtimes +There are four distinct types of LLVM/OpenMP runtimes LLVM/OpenMP Host Runtime (``libomp``) ------------------------------------- @@ -48,7 +48,10 @@ `Speedscope App`_. Building this feature depends on the `LLVM Support Library`_ for time trace output. Using this library is enabled by default when building using the CMake option ``OPENMP_ENABLE_LIBOMPTARGET_PROFILING``. The output will -be saved to the filename specified by the environment variable. +be saved to the filename specified by the environment variable. For multi-threaded +applications, profiling in ``libomp`` is also needed. Setting the CMake option +``OPENMP_ENABLE_LIBOMP_PROFILING=ON`` to enable the feature. Note that this will +turn ``libomp`` to a C++ library. .. _`Chrome Tracing`: https://www.chromium.org/developers/how-tos/trace-event-profiling-tool @@ -78,7 +81,7 @@ is enabled at any level of debugging so a full debug runtime is not required. For minimal debugging information compile with `-gline-tables-only`, or compile with `-g` for full debug information. A full list of flags supported by -``LIBOMPTARGET_INFO`` is given below. +``LIBOMPTARGET_INFO`` is given below. * Print all data arguments upon entering an OpenMP device kernel: ``0x01`` * Indicate when a mapped address already exists in the device mapping table: @@ -135,8 +138,8 @@ Info: Device supports up to 65536 CUDA blocks and 1024 threads with a warp size of 32 Info: Entering OpenMP data region at zaxpy.cpp:14:1 with 2 arguments: - Info: to(X[0:N])[16384] - Info: tofrom(Y[0:N])[16384] + Info: to(X[0:N])[16384] + Info: tofrom(Y[0:N])[16384] Info: OpenMP Host-Device pointer mappings after block at zaxpy.cpp:14:1: Info: Host Ptr Target Ptr Size (B) RefCount Declaration Info: 0x00007fff963f4000 0x00007fd225004000 16384 1 Y[0:N] at zaxpy.cpp:13:17 @@ -146,9 +149,9 @@ Info: use_address(Y)[0] (implicit) Info: tofrom(D)[16] (implicit) Info: use_address(X)[0] (implicit) - Info: Mapping exists (implicit) with HstPtrBegin=0x00007ffe37d8be80, + Info: Mapping exists (implicit) with HstPtrBegin=0x00007ffe37d8be80, TgtPtrBegin=0x00007f90ff004000, Size=0, updated RefCount=2, Name=Y - Info: Mapping exists (implicit) with HstPtrBegin=0x00007ffe37d8fe80, + Info: Mapping exists (implicit) with HstPtrBegin=0x00007ffe37d8fe80, TgtPtrBegin=0x00007f90ff000000, Size=0, updated RefCount=2, Name=X Info: Launching kernel __omp_offloading_fd02_c2c4ac1a__Z5daxpyPNSt3__17complexIdEES2_S1_m_l6 with 8 blocks and 128 threads in SPMD mode @@ -157,8 +160,8 @@ Info: 0x00007fff963f4000 0x00007fd225004000 16384 1 Y[0:N] at zaxpy.cpp:13:17 Info: 0x00007fff963f8000 0x00007fd225000000 16384 1 X[0:N] at zaxpy.cpp:13:11 Info: Exiting OpenMP data region at zaxpy.cpp:14:1 with 2 arguments: - Info: to(X[0:N])[16384] - Info: tofrom(Y[0:N])[16384] + Info: to(X[0:N])[16384] + Info: tofrom(Y[0:N])[16384] From this information, we can see the OpenMP kernel being launched on the CUDA device with enough threads and blocks for all ``1024`` iterations of the loop in @@ -197,10 +200,10 @@ #pragma omp target teams distribute parallel for reduction(+:sum) for (int i = 0; i < N; ++i) sum += A[i]; - + return sum; } - + int main() { const int N = 1024; double A[N]; @@ -218,7 +221,7 @@ .. code-block:: text CUDA error: Error when copying data from device to host. - CUDA error: an illegal memory access was encountered + CUDA error: an illegal memory access was encountered Libomptarget error: Copying data from device failed. Libomptarget error: Call to targetDataEnd failed, abort target. Libomptarget error: Failed to process data after launching the kernel. @@ -258,7 +261,7 @@ #pragma omp target teams distribute parallel for reduction(+:sum) map(to:A[0 : N]) for (int i = 0; i < N; ++i) sum += A[i]; - + return sum; } diff --git a/openmp/runtime/CMakeLists.txt b/openmp/runtime/CMakeLists.txt --- a/openmp/runtime/CMakeLists.txt +++ b/openmp/runtime/CMakeLists.txt @@ -34,7 +34,6 @@ # Should assertions be enabled? They are on by default. set(LIBOMP_ENABLE_ASSERTIONS TRUE CACHE BOOL "enable assertions?") - set(LIBOMPTARGET_PROFILING_SUPPORT FALSE) else() # Part of LLVM build # Determine the native architecture from LLVM. string(TOLOWER "${LLVM_TARGET_ARCH}" LIBOMP_NATIVE_ARCH) @@ -66,10 +65,11 @@ libomp_get_architecture(LIBOMP_ARCH) endif () set(LIBOMP_ENABLE_ASSERTIONS ${LLVM_ENABLE_ASSERTIONS}) - # Time profiling support - set(LIBOMPTARGET_PROFILING_SUPPORT ${OPENMP_ENABLE_LIBOMPTARGET_PROFILING}) endif() +# Time profiling support +set(LIBOMP_PROFILING_SUPPORT ${OPENMP_ENABLE_LIBOMP_PROFILING}) + # FUJITSU A64FX is a special processor because its cache line size is 256. # We need to pass this information into kmp_config.h. if(LIBOMP_ARCH STREQUAL "aarch64") diff --git a/openmp/runtime/src/CMakeLists.txt b/openmp/runtime/src/CMakeLists.txt --- a/openmp/runtime/src/CMakeLists.txt +++ b/openmp/runtime/src/CMakeLists.txt @@ -50,9 +50,8 @@ include_directories(${LIBOMP_HWLOC_INSTALL_DIR}/include) endif() -# Building with time profiling support for libomptarget requires -# LLVM directory includes. -if(LIBOMPTARGET_PROFILING_SUPPORT) +# Building with time profiling support requires LLVM directory includes. +if(LIBOMP_PROFILING_SUPPORT) include_directories( ${LLVM_MAIN_INCLUDE_DIR} ${LLVM_INCLUDE_DIR} @@ -144,7 +143,7 @@ libomp_get_libflags(LIBOMP_CONFIGURED_LIBFLAGS) # Build libomp library. Add LLVMSupport dependency if building in-tree with libomptarget profiling enabled. -if(OPENMP_STANDALONE_BUILD OR (NOT OPENMP_ENABLE_LIBOMPTARGET_PROFILING)) +if(OPENMP_STANDALONE_BUILD OR (NOT OPENMP_ENABLE_LIBOMP_PROFILING)) add_library(omp ${LIBOMP_LIBRARY_KIND} ${LIBOMP_SOURCE_FILES}) # Linking command will include libraries in LIBOMP_CONFIGURED_LIBFLAGS target_link_libraries(omp ${LIBOMP_CONFIGURED_LIBFLAGS} ${CMAKE_DL_LIBS}) @@ -153,6 +152,8 @@ LINK_LIBS ${LIBOMP_CONFIGURED_LIBFLAGS} ${CMAKE_DL_LIBS} LINK_COMPONENTS Support ) + # libomp must be a C++ library such that it can link libLLVMSupport + set(LIBOMP_LINKER_LANGUAGE CXX) endif() set_target_properties(omp PROPERTIES diff --git a/openmp/runtime/src/kmp_config.h.cmake b/openmp/runtime/src/kmp_config.h.cmake --- a/openmp/runtime/src/kmp_config.h.cmake +++ b/openmp/runtime/src/kmp_config.h.cmake @@ -44,8 +44,8 @@ #define OMPT_DEBUG LIBOMP_OMPT_DEBUG #cmakedefine01 LIBOMP_OMPT_SUPPORT #define OMPT_SUPPORT LIBOMP_OMPT_SUPPORT -#cmakedefine01 LIBOMPTARGET_PROFILING_SUPPORT -#define OMPTARGET_PROFILING_SUPPORT LIBOMPTARGET_PROFILING_SUPPORT +#cmakedefine01 LIBOMP_PROFILING_SUPPORT +#define OMP_PROFILING_SUPPORT LIBOMP_PROFILING_SUPPORT #cmakedefine01 LIBOMP_OMPT_OPTIONAL #define OMPT_OPTIONAL LIBOMP_OMPT_OPTIONAL #cmakedefine01 LIBOMP_USE_ADAPTIVE_LOCKS diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp --- a/openmp/runtime/src/kmp_runtime.cpp +++ b/openmp/runtime/src/kmp_runtime.cpp @@ -32,7 +32,7 @@ #include "ompt-specific.h" #endif -#if OMPTARGET_PROFILING_SUPPORT +#if OMP_PROFILING_SUPPORT #include "llvm/Support/TimeProfiler.h" static char *ProfileTraceFile = nullptr; #endif @@ -5740,7 +5740,7 @@ /* ------------------------------------------------------------------------ */ void *__kmp_launch_thread(kmp_info_t *this_thr) { -#if OMPTARGET_PROFILING_SUPPORT +#if OMP_PROFILING_SUPPORT ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE"); // TODO: add a configuration option for time granularity if (ProfileTraceFile) @@ -5848,7 +5848,7 @@ KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid)); KMP_MB(); -#if OMPTARGET_PROFILING_SUPPORT +#if OMP_PROFILING_SUPPORT llvm::timeTraceProfilerFinishThread(); #endif return this_thr;