diff --git a/openmp/CMakeLists.txt b/openmp/CMakeLists.txt
--- a/openmp/CMakeLists.txt
+++ b/openmp/CMakeLists.txt
@@ -82,6 +82,8 @@
 
 option(OPENMP_ENABLE_LIBOMPTARGET "Enable building libomptarget for offloading."
        ${ENABLE_LIBOMPTARGET})
+option(OPENMP_ENABLE_LIBOMPTARGET_PROFILING "Enable time profiling for libomptarget."
+       ${ENABLE_LIBOMPTARGET})
 if (OPENMP_ENABLE_LIBOMPTARGET)
   # Check that the library can actually be built.
   if (APPLE OR WIN32)
diff --git a/openmp/libomptarget/src/CMakeLists.txt b/openmp/libomptarget/src/CMakeLists.txt
--- a/openmp/libomptarget/src/CMakeLists.txt
+++ b/openmp/libomptarget/src/CMakeLists.txt
@@ -21,11 +21,23 @@
   omptarget.cpp
 )
 
-# Build libomptarget library with libdl dependency.
-add_library(omptarget SHARED ${src_files})
-target_link_libraries(omptarget
-  ${CMAKE_DL_LIBS}
-  "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/exports")
+# Build libomptarget, always linking libdl. An in-tree build with profiling
+# enabled additionally links LLVMSupport and defines OMPTARGET_PROFILE_ENABLED.
+if((NOT OPENMP_STANDALONE_BUILD) AND OPENMP_ENABLE_LIBOMPTARGET_PROFILING)
+  set(LLVM_LINK_COMPONENTS
+    Support
+  )
+  add_llvm_library(omptarget SHARED ${src_files}
+    LINK_LIBS ${CMAKE_DL_LIBS}
+    "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/exports"
+  )
+  target_compile_definitions(omptarget PUBLIC OMPTARGET_PROFILE_ENABLED)
+else()
+  add_library(omptarget SHARED ${src_files})
+  target_link_libraries(omptarget
+    ${CMAKE_DL_LIBS}
+    "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/exports")
+endif()
 
 # Install libomptarget under the lib destination folder.
install(TARGETS omptarget LIBRARY COMPONENT omptarget diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp --- a/openmp/libomptarget/src/api.cpp +++ b/openmp/libomptarget/src/api.cpp @@ -19,6 +19,7 @@ #include EXTERN int omp_get_num_devices(void) { + TIMESCOPE(); PM->RTLsMtx.lock(); size_t DevicesSize = PM->Devices.size(); PM->RTLsMtx.unlock(); @@ -29,12 +30,14 @@ } EXTERN int omp_get_initial_device(void) { + TIMESCOPE(); int hostDevice = omp_get_num_devices(); DP("Call to omp_get_initial_device returning %d\n", hostDevice); return hostDevice; } EXTERN void *omp_target_alloc(size_t size, int device_num) { + TIMESCOPE(); DP("Call to omp_target_alloc for device %d requesting %zu bytes\n", device_num, size); @@ -62,6 +65,7 @@ } EXTERN void omp_target_free(void *device_ptr, int device_num) { + TIMESCOPE(); DP("Call to omp_target_free for device %d and address " DPxMOD "\n", device_num, DPxPTR(device_ptr)); @@ -86,6 +90,7 @@ } EXTERN int omp_target_is_present(void *ptr, int device_num) { + TIMESCOPE(); DP("Call to omp_target_is_present for device %d and address " DPxMOD "\n", device_num, DPxPTR(ptr)); @@ -125,6 +130,7 @@ EXTERN int omp_target_memcpy(void *dst, void *src, size_t length, size_t dst_offset, size_t src_offset, int dst_device, int src_device) { + TIMESCOPE(); DP("Call to omp_target_memcpy, dst device %d, src device %d, " "dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, " "src offset %zu, length %zu\n", dst_device, src_device, DPxPTR(dst), @@ -190,6 +196,7 @@ int num_dims, const size_t *volume, const size_t *dst_offsets, const size_t *src_offsets, const size_t *dst_dimensions, const size_t *src_dimensions, int dst_device, int src_device) { + TIMESCOPE(); DP("Call to omp_target_memcpy_rect, dst device %d, src device %d, " "dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", " "src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", " @@ -244,6 +251,7 @@ EXTERN int omp_target_associate_ptr(void 
*host_ptr, void *device_ptr, size_t size, size_t device_offset, int device_num) { + TIMESCOPE(); DP("Call to omp_target_associate_ptr with host_ptr " DPxMOD ", " "device_ptr " DPxMOD ", size %zu, device_offset %zu, device_num %d\n", DPxPTR(host_ptr), DPxPTR(device_ptr), size, device_offset, device_num); @@ -271,6 +279,7 @@ } EXTERN int omp_target_disassociate_ptr(void *host_ptr, int device_num) { + TIMESCOPE(); DP("Call to omp_target_disassociate_ptr with host_ptr " DPxMOD ", " "device_num %d\n", DPxPTR(host_ptr), device_num); diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp --- a/openmp/libomptarget/src/interface.cpp +++ b/openmp/libomptarget/src/interface.cpp @@ -81,18 +81,21 @@ //////////////////////////////////////////////////////////////////////////////// /// adds requires flags EXTERN void __tgt_register_requires(int64_t flags) { + TIMESCOPE(); PM->RTLs.RegisterRequires(flags); } //////////////////////////////////////////////////////////////////////////////// /// adds a target shared library to the target execution image EXTERN void __tgt_register_lib(__tgt_bin_desc *desc) { + TIMESCOPE(); PM->RTLs.RegisterLib(desc); } //////////////////////////////////////////////////////////////////////////////// /// unloads a target shared library EXTERN void __tgt_unregister_lib(__tgt_bin_desc *desc) { + TIMESCOPE(); PM->RTLs.UnregisterLib(desc); } @@ -101,6 +104,7 @@ /// and passes the data to the device. 
EXTERN void __tgt_target_data_begin(int64_t device_id, int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { + TIMESCOPE(); __tgt_target_data_begin_mapper(nullptr, device_id, arg_num, args_base, args, arg_sizes, arg_types, nullptr, nullptr); } @@ -109,6 +113,7 @@ void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, int32_t depNum, void *depList, int32_t noAliasDepNum, void *noAliasDepList) { + TIMESCOPE(); if (depNum + noAliasDepNum > 0) __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); @@ -122,6 +127,7 @@ int64_t *arg_types, map_var_info_t *arg_names, void **arg_mappers) { + TIMESCOPE(); if (IsOffloadDisabled()) return; DP("Entering data begin region for device %" PRId64 " with %d mappings\n", @@ -160,6 +166,7 @@ void **args, int64_t *arg_sizes, int64_t *arg_types, map_var_info_t *arg_names, void **arg_mappers, int32_t depNum, void *depList, int32_t noAliasDepNum, void *noAliasDepList) { + TIMESCOPE(); if (depNum + noAliasDepNum > 0) __kmpc_omp_taskwait(loc, __kmpc_global_thread_num(loc)); @@ -172,6 +179,7 @@ /// created by the last __tgt_target_data_begin. 
EXTERN void __tgt_target_data_end(int64_t device_id, int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { + TIMESCOPE(); __tgt_target_data_end_mapper(nullptr, device_id, arg_num, args_base, args, arg_sizes, arg_types, nullptr, nullptr); } @@ -180,6 +188,7 @@ void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, int32_t depNum, void *depList, int32_t noAliasDepNum, void *noAliasDepList) { + TIMESCOPE(); if (depNum + noAliasDepNum > 0) __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); @@ -193,6 +202,7 @@ int64_t *arg_types, map_var_info_t *arg_names, void **arg_mappers) { + TIMESCOPE(); if (IsOffloadDisabled()) return; DP("Entering data end region with %d mappings\n", arg_num); @@ -236,6 +246,7 @@ void **args, int64_t *arg_sizes, int64_t *arg_types, map_var_info_t *arg_names, void **arg_mappers, int32_t depNum, void *depList, int32_t noAliasDepNum, void *noAliasDepList) { + TIMESCOPE(); if (depNum + noAliasDepNum > 0) __kmpc_omp_taskwait(loc, __kmpc_global_thread_num(loc)); @@ -245,6 +256,7 @@ EXTERN void __tgt_target_data_update(int64_t device_id, int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { + TIMESCOPE(); __tgt_target_data_update_mapper(nullptr, device_id, arg_num, args_base, args, arg_sizes, arg_types, nullptr, nullptr); } @@ -253,6 +265,7 @@ void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, int32_t depNum, void *depList, int32_t noAliasDepNum, void *noAliasDepList) { + TIMESCOPE(); if (depNum + noAliasDepNum > 0) __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); @@ -266,6 +279,7 @@ int64_t *arg_types, map_var_info_t *arg_names, void **arg_mappers) { + TIMESCOPE(); if (IsOffloadDisabled()) return; DP("Entering data update with %d mappings\n", arg_num); @@ -291,6 +305,7 @@ void **args, int64_t *arg_sizes, int64_t *arg_types, map_var_info_t *arg_names, void **arg_mappers, int32_t depNum, void *depList, int32_t noAliasDepNum, void 
*noAliasDepList) { + TIMESCOPE(); if (depNum + noAliasDepNum > 0) __kmpc_omp_taskwait(loc, __kmpc_global_thread_num(loc)); @@ -300,6 +315,7 @@ EXTERN int __tgt_target(int64_t device_id, void *host_ptr, int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { + TIMESCOPE(); return __tgt_target_mapper(nullptr, device_id, host_ptr, arg_num, args_base, args, arg_sizes, arg_types, nullptr, nullptr); } @@ -308,6 +324,7 @@ int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, int32_t depNum, void *depList, int32_t noAliasDepNum, void *noAliasDepList) { + TIMESCOPE(); if (depNum + noAliasDepNum > 0) __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); @@ -319,6 +336,7 @@ int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, map_var_info_t *arg_names, void **arg_mappers) { + TIMESCOPE(); if (IsOffloadDisabled()) return OFFLOAD_FAIL; DP("Entering target region with entry point " DPxMOD " and device Id %" PRId64 "\n", DPxPTR(host_ptr), device_id); @@ -353,6 +371,7 @@ void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, map_var_info_t *arg_names, void **arg_mappers, int32_t depNum, void *depList, int32_t noAliasDepNum, void *noAliasDepList) { + TIMESCOPE(); if (depNum + noAliasDepNum > 0) __kmpc_omp_taskwait(loc, __kmpc_global_thread_num(loc)); @@ -363,6 +382,7 @@ EXTERN int __tgt_target_teams(int64_t device_id, void *host_ptr, int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, int32_t team_num, int32_t thread_limit) { + TIMESCOPE(); return __tgt_target_teams_mapper(nullptr, device_id, host_ptr, arg_num, args_base, args, arg_sizes, arg_types, nullptr, nullptr, team_num, thread_limit); @@ -372,6 +392,7 @@ int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, int32_t team_num, int32_t thread_limit, int32_t depNum, void *depList, int32_t noAliasDepNum, void *noAliasDepList) { + 
TIMESCOPE(); if (depNum + noAliasDepNum > 0) __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); @@ -387,6 +408,7 @@ map_var_info_t *arg_names, void **arg_mappers, int32_t team_num, int32_t thread_limit) { + TIMESCOPE(); if (IsOffloadDisabled()) return OFFLOAD_FAIL; DP("Entering target region with entry point " DPxMOD " and device Id %" PRId64 "\n", DPxPTR(host_ptr), device_id); @@ -424,6 +446,7 @@ map_var_info_t *arg_names, void **arg_mappers, int32_t team_num, int32_t thread_limit, int32_t depNum, void *depList, int32_t noAliasDepNum, void *noAliasDepList) { + TIMESCOPE(); if (depNum + noAliasDepNum > 0) __kmpc_omp_taskwait(loc, __kmpc_global_thread_num(loc)); @@ -434,6 +457,7 @@ // Get the current number of components for a user-defined mapper. EXTERN int64_t __tgt_mapper_num_components(void *rt_mapper_handle) { + TIMESCOPE(); auto *MapperComponentsPtr = (struct MapperComponentsTy *)rt_mapper_handle; int64_t size = MapperComponentsPtr->Components.size(); DP("__tgt_mapper_num_components(Handle=" DPxMOD ") returns %" PRId64 "\n", @@ -445,6 +469,7 @@ EXTERN void __tgt_push_mapper_component(void *rt_mapper_handle, void *base, void *begin, int64_t size, int64_t type) { + TIMESCOPE(); DP("__tgt_push_mapper_component(Handle=" DPxMOD ") adds an entry (Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64 ", Type=0x%" PRIx64 ").\n", @@ -456,6 +481,7 @@ EXTERN void __kmpc_push_target_tripcount(ident_t *loc, int64_t device_id, uint64_t loop_tripcount) { + TIMESCOPE(); if (IsOffloadDisabled()) return; diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h --- a/openmp/libomptarget/src/private.h +++ b/openmp/libomptarget/src/private.h @@ -107,4 +107,11 @@ } } +#ifdef OMPTARGET_PROFILE_ENABLED +#include "llvm/Support/TimeProfiler.h" +#define TIMESCOPE() llvm::TimeTraceScope TimeScope(__FUNCTION__) +#else +#define TIMESCOPE() +#endif + #endif diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp --- 
a/openmp/libomptarget/src/rtl.cpp
+++ b/openmp/libomptarget/src/rtl.cpp
@@ -33,14 +33,43 @@
 
 PluginManager *PM;
 
+#ifdef OMPTARGET_PROFILE_ENABLED
+// Value of the LIBOMPTARGET_PROFILE environment variable, read in init().
+// Null when the variable is unset, in which case no trace is collected.
+static char *ProfileTraceFile = nullptr;
+#endif
+
 __attribute__((constructor(101))) void init() {
   DP("Init target library!\n");
   PM = new PluginManager();
+
+#ifdef OMPTARGET_PROFILE_ENABLED
+  // Profiling is opt-in at run time: the time-trace profiler is only started
+  // when LIBOMPTARGET_PROFILE is set; deinit() then writes the trace out.
+  ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
+  // TODO: add a configuration option for time granularity
+  if (ProfileTraceFile)
+    llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
+#endif
 }
 
 __attribute__((destructor(101))) void deinit() {
   DP("Deinit target library!\n");
   delete PM;
+
+#ifdef OMPTARGET_PROFILE_ENABLED
+  if (ProfileTraceFile) {
+    // TODO: add env var for file output
+    // The returned llvm::Error has to be consumed even on failure, not just
+    // tested; toString() consumes it and supplies the reason for the message.
+    if (auto E = llvm::timeTraceProfilerWrite(ProfileTraceFile, "-"))
+      fprintf(stderr, "Error writing out the time trace: %s\n",
+              llvm::toString(std::move(E)).c_str());
+
+    llvm::timeTraceProfilerCleanup();
+  }
+#endif
 }
 
 void RTLsTy::LoadRTLs() {