Index: llvm/include/llvm/Support/TimeProfiler.h =================================================================== --- llvm/include/llvm/Support/TimeProfiler.h +++ llvm/include/llvm/Support/TimeProfiler.h @@ -9,14 +9,61 @@ #ifndef LLVM_SUPPORT_TIMEPROFILER_H #define LLVM_SUPPORT_TIMEPROFILER_H -#include "llvm/Support/Error.h" #include "llvm/ADT/STLFunctionalExtras.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/Process.h" +#include +#include + +typedef std::chrono::duration + DurationType; +typedef std::chrono::time_point TimePointType; +typedef std::pair CountAndDurationType; +typedef std::pair + NameAndCountAndDurationType; namespace llvm { class raw_pwrite_stream; -struct TimeTraceProfiler; +struct Entry { + const TimePointType Start; + TimePointType End; + const std::string Name; + const std::string Detail; + + Entry(TimePointType &&S, TimePointType &&E, std::string &&N, + std::string &&Dt); + + // Calculate timings for FlameGraph. + std::chrono::system_clock::rep + getFlameGraphStartUs(TimePointType StartTime) const; + std::chrono::system_clock::rep getFlameGraphDurUs() const; +}; +struct TimeTraceProfiler { + TimeTraceProfiler(unsigned TimeTraceGranularity = 0, StringRef ProcName = ""); + void begin(std::string Name, llvm::function_ref Detail); + void end(); + + // Write events from this TimeTraceProfilerInstance and + // ThreadTimeTraceProfilerInstances. + void write(raw_pwrite_stream &OS); + + SmallVector Stack; + SmallVector Entries; + StringMap CountAndTotalPerName; + const std::chrono::time_point BeginningOfTime; + const TimePointType StartTime; + const std::string ProcName; + const sys::Process::Pid Pid; + SmallString<0> ThreadName; + const uint64_t Tid; + + // Minimum time granularity (in microseconds) + const unsigned TimeTraceGranularity; +}; TimeTraceProfiler *getTimeTraceProfilerInstance(); /// Initialize the time trace profiler. @@ -90,6 +137,13 @@ } }; +struct TimeTraceProfilerInstances { + std::mutex Lock; + std::vector List; +}; +static TimeTraceProfilerInstances Instances; +TimeTraceProfilerInstances &getTimeTraceProfilerInstances(); + } // end namespace llvm #endif Index: llvm/lib/Support/TimeProfiler.cpp =================================================================== --- llvm/lib/Support/TimeProfiler.cpp +++ llvm/lib/Support/TimeProfiler.cpp @@ -15,11 +15,9 @@ #include "llvm/ADT/StringMap.h" #include "llvm/Support/JSON.h" #include "llvm/Support/Path.h" -#include "llvm/Support/Process.h" #include "llvm/Support/Threading.h" #include #include -#include #include #include #include @@ -27,20 +25,6 @@ using namespace std::chrono; using namespace llvm; -namespace { - -struct TimeTraceProfilerInstances { - std::mutex Lock; - std::vector List; -}; - -TimeTraceProfilerInstances &getTimeTraceProfilerInstances() { - static TimeTraceProfilerInstances Instances; - return Instances; -} - -} // anonymous namespace - // Per Thread instance static LLVM_THREAD_LOCAL TimeTraceProfiler *TimeTraceProfilerInstance = nullptr; @@ -48,226 +32,203 @@ return TimeTraceProfilerInstance; } -typedef duration DurationType; -typedef time_point TimePointType; -typedef std::pair CountAndDurationType; -typedef std::pair - NameAndCountAndDurationType; - -namespace { -struct Entry { - const TimePointType Start; - TimePointType End; - const std::string Name; - const std::string Detail; - - Entry(TimePointType &&S, TimePointType &&E, std::string &&N, std::string &&Dt) - : Start(std::move(S)), End(std::move(E)), Name(std::move(N)), - Detail(std::move(Dt)) {} - - // Calculate timings for FlameGraph. Cast time points to microsecond precision - // rather than casting duration. This avoid truncation issues causing inner - // scopes overruning outer scopes. - steady_clock::rep getFlameGraphStartUs(TimePointType StartTime) const { - return (time_point_cast(Start) - - time_point_cast(StartTime)) - .count(); - } +namespace llvm { +Entry::Entry(TimePointType &&S, TimePointType &&E, std::string &&N, + std::string &&Dt) + : Start(std::move(S)), End(std::move(E)), Name(std::move(N)), + Detail(std::move(Dt)) {} + +// Calculate timings for FlameGraph. Cast time points to microsecond precision +// rather than casting duration. This avoid truncation issues causing inner +// scopes overruning outer scopes. +system_clock::rep Entry::getFlameGraphStartUs(TimePointType StartTime) const { + return (time_point_cast(Start) - + time_point_cast(StartTime)) + .count(); +} - steady_clock::rep getFlameGraphDurUs() const { - return (time_point_cast(End) - - time_point_cast(Start)) - .count(); - } -}; -} // namespace - -struct llvm::TimeTraceProfiler { - TimeTraceProfiler(unsigned TimeTraceGranularity = 0, StringRef ProcName = "") - : BeginningOfTime(system_clock::now()), StartTime(steady_clock::now()), - ProcName(ProcName), Pid(sys::Process::getProcessId()), - Tid(llvm::get_threadid()), TimeTraceGranularity(TimeTraceGranularity) { - llvm::get_thread_name(ThreadName); - } +system_clock::rep Entry::getFlameGraphDurUs() const { + return (time_point_cast(End) - + time_point_cast(Start)) + .count(); +} - void begin(std::string Name, llvm::function_ref Detail) { - Stack.emplace_back(steady_clock::now(), TimePointType(), std::move(Name), - Detail()); - } +TimeTraceProfiler::TimeTraceProfiler(unsigned TimeTraceGranularity, + StringRef ProcName) + : BeginningOfTime(system_clock::now()), StartTime(system_clock::now()), + ProcName(ProcName), Pid(sys::Process::getProcessId()), + Tid(llvm::get_threadid()), TimeTraceGranularity(TimeTraceGranularity) { + llvm::get_thread_name(ThreadName); +} + +void TimeTraceProfiler::begin(std::string Name, + llvm::function_ref Detail) { + Stack.emplace_back(system_clock::now(), TimePointType(), std::move(Name), + Detail()); +} - void end() { - assert(!Stack.empty() && "Must call begin() first"); - Entry &E = Stack.back(); - E.End = steady_clock::now(); - - // Check that end times monotonically increase. - assert((Entries.empty() || - (E.getFlameGraphStartUs(StartTime) + E.getFlameGraphDurUs() >= - Entries.back().getFlameGraphStartUs(StartTime) + - Entries.back().getFlameGraphDurUs())) && - "TimeProfiler scope ended earlier than previous scope"); - - // Calculate duration at full precision for overall counts. - DurationType Duration = E.End - E.Start; - - // Only include sections longer or equal to TimeTraceGranularity msec. - if (duration_cast(Duration).count() >= TimeTraceGranularity) - Entries.emplace_back(E); - - // Track total time taken by each "name", but only the topmost levels of - // them; e.g. if there's a template instantiation that instantiates other - // templates from within, we only want to add the topmost one. "topmost" - // happens to be the ones that don't have any currently open entries above - // itself. - if (llvm::none_of(llvm::drop_begin(llvm::reverse(Stack)), - [&](const Entry &Val) { return Val.Name == E.Name; })) { - auto &CountAndTotal = CountAndTotalPerName[E.Name]; - CountAndTotal.first++; - CountAndTotal.second += Duration; - } - - Stack.pop_back(); +void TimeTraceProfiler::end() { + assert(!Stack.empty() && "Must call begin() first"); + Entry &E = Stack.back(); + E.End = system_clock::now(); + + // Check that end times monotonically increase. + assert((Entries.empty() || + (E.getFlameGraphStartUs(StartTime) + E.getFlameGraphDurUs() >= + Entries.back().getFlameGraphStartUs(StartTime) + + Entries.back().getFlameGraphDurUs())) && + "TimeProfiler scope ended earlier than previous scope"); + + // Calculate duration at full precision for overall counts. + DurationType Duration = E.End - E.Start; + + // Only include sections longer or equal to TimeTraceGranularity msec. + if (duration_cast(Duration).count() >= TimeTraceGranularity) + Entries.emplace_back(E); + + // Track total time taken by each "name", but only the topmost levels of + // them; e.g. if there's a template instantiation that instantiates other + // templates from within, we only want to add the topmost one. "topmost" + // happens to be the ones that don't have any currently open entries above + // itself. + if (llvm::none_of(llvm::drop_begin(llvm::reverse(Stack)), + [&](const Entry &Val) { return Val.Name == E.Name; })) { + auto &CountAndTotal = CountAndTotalPerName[E.Name]; + CountAndTotal.first++; + CountAndTotal.second += Duration; } - // Write events from this TimeTraceProfilerInstance and - // ThreadTimeTraceProfilerInstances. - void write(raw_pwrite_stream &OS) { - // Acquire Mutex as reading ThreadTimeTraceProfilerInstances. - auto &Instances = getTimeTraceProfilerInstances(); - std::lock_guard Lock(Instances.Lock); - assert(Stack.empty() && - "All profiler sections should be ended when calling write"); - assert(llvm::all_of(Instances.List, - [](const auto &TTP) { return TTP->Stack.empty(); }) && - "All profiler sections should be ended when calling write"); - - json::OStream J(OS); - J.objectBegin(); - J.attributeBegin("traceEvents"); - J.arrayBegin(); - - // Emit all events for the main flame graph. - auto writeEvent = [&](const auto &E, uint64_t Tid) { - auto StartUs = E.getFlameGraphStartUs(StartTime); - auto DurUs = E.getFlameGraphDurUs(); - - J.object([&] { - J.attribute("pid", Pid); - J.attribute("tid", int64_t(Tid)); - J.attribute("ph", "X"); - J.attribute("ts", StartUs); - J.attribute("dur", DurUs); - J.attribute("name", E.Name); - if (!E.Detail.empty()) { - J.attributeObject("args", [&] { J.attribute("detail", E.Detail); }); - } - }); - }; - for (const Entry &E : Entries) - writeEvent(E, this->Tid); - for (const TimeTraceProfiler *TTP : Instances.List) - for (const Entry &E : TTP->Entries) - writeEvent(E, TTP->Tid); - - // Emit totals by section name as additional "thread" events, sorted from - // longest one. - // Find highest used thread id. - uint64_t MaxTid = this->Tid; - for (const TimeTraceProfiler *TTP : Instances.List) - MaxTid = std::max(MaxTid, TTP->Tid); - - // Combine all CountAndTotalPerName from threads into one. - StringMap AllCountAndTotalPerName; - auto combineStat = [&](const auto &Stat) { - StringRef Key = Stat.getKey(); - auto Value = Stat.getValue(); - auto &CountAndTotal = AllCountAndTotalPerName[Key]; - CountAndTotal.first += Value.first; - CountAndTotal.second += Value.second; - }; - for (const auto &Stat : CountAndTotalPerName) - combineStat(Stat); - for (const TimeTraceProfiler *TTP : Instances.List) - for (const auto &Stat : TTP->CountAndTotalPerName) - combineStat(Stat); - - std::vector SortedTotals; - SortedTotals.reserve(AllCountAndTotalPerName.size()); - for (const auto &Total : AllCountAndTotalPerName) - SortedTotals.emplace_back(std::string(Total.getKey()), Total.getValue()); - - llvm::sort(SortedTotals, [](const NameAndCountAndDurationType &A, - const NameAndCountAndDurationType &B) { - return A.second.second > B.second.second; + Stack.pop_back(); +} + +// Write events from this TimeTraceProfilerInstance and +// ThreadTimeTraceProfilerInstances. +void TimeTraceProfiler::write(raw_pwrite_stream &OS) { + // Acquire Mutex as reading ThreadTimeTraceProfilerInstances. + auto &Instances = getTimeTraceProfilerInstances(); + std::lock_guard Lock(Instances.Lock); + assert(Stack.empty() && + "All profiler sections should be ended when calling write"); + assert(llvm::all_of(Instances.List, + [](const auto &TTP) { return TTP->Stack.empty(); }) && + "All profiler sections should be ended when calling write"); + + json::OStream J(OS); + J.objectBegin(); + J.attributeBegin("traceEvents"); + J.arrayBegin(); + + // Emit all events for the main flame graph. + auto writeEvent = [&](const auto &E, uint64_t Tid) { + auto StartUs = E.getFlameGraphStartUs(StartTime); + auto DurUs = E.getFlameGraphDurUs(); + + J.object([&] { + J.attribute("pid", Pid); + J.attribute("tid", int64_t(Tid)); + J.attribute("ph", "X"); + J.attribute("ts", StartUs); + J.attribute("dur", DurUs); + J.attribute("name", E.Name); + if (!E.Detail.empty()) { + J.attributeObject("args", [&] { J.attribute("detail", E.Detail); }); + } }); + }; + for (const Entry &E : Entries) + writeEvent(E, this->Tid); + for (const TimeTraceProfiler *TTP : Instances.List) + for (const Entry &E : TTP->Entries) + writeEvent(E, TTP->Tid); + + // Emit totals by section name as additional "thread" events, sorted from + // longest one. + // Find highest used thread id. + uint64_t MaxTid = this->Tid; + for (const TimeTraceProfiler *TTP : Instances.List) + MaxTid = std::max(MaxTid, TTP->Tid); + + // Combine all CountAndTotalPerName from threads into one. + StringMap AllCountAndTotalPerName; + auto combineStat = [&](const auto &Stat) { + StringRef Key = Stat.getKey(); + auto Value = Stat.getValue(); + auto &CountAndTotal = AllCountAndTotalPerName[Key]; + CountAndTotal.first += Value.first; + CountAndTotal.second += Value.second; + }; + for (const auto &Stat : CountAndTotalPerName) + combineStat(Stat); + for (const TimeTraceProfiler *TTP : Instances.List) + for (const auto &Stat : TTP->CountAndTotalPerName) + combineStat(Stat); - // Report totals on separate threads of tracing file. - uint64_t TotalTid = MaxTid + 1; - for (const NameAndCountAndDurationType &Total : SortedTotals) { - auto DurUs = duration_cast(Total.second.second).count(); - auto Count = AllCountAndTotalPerName[Total.first].first; - - J.object([&] { - J.attribute("pid", Pid); - J.attribute("tid", int64_t(TotalTid)); - J.attribute("ph", "X"); - J.attribute("ts", 0); - J.attribute("dur", DurUs); - J.attribute("name", "Total " + Total.first); - J.attributeObject("args", [&] { - J.attribute("count", int64_t(Count)); - J.attribute("avg ms", int64_t(DurUs / Count / 1000)); - }); + std::vector SortedTotals; + SortedTotals.reserve(AllCountAndTotalPerName.size()); + for (const auto &Total : AllCountAndTotalPerName) + SortedTotals.emplace_back(std::string(Total.getKey()), Total.getValue()); + + llvm::sort(SortedTotals, [](const NameAndCountAndDurationType &A, + const NameAndCountAndDurationType &B) { + return A.second.second > B.second.second; + }); + + // Report totals on separate threads of tracing file. + uint64_t TotalTid = MaxTid + 1; + for (const NameAndCountAndDurationType &Total : SortedTotals) { + auto DurUs = duration_cast(Total.second.second).count(); + auto Count = AllCountAndTotalPerName[Total.first].first; + + J.object([&] { + J.attribute("pid", Pid); + J.attribute("tid", int64_t(TotalTid)); + J.attribute("ph", "X"); + J.attribute("ts", 0); + J.attribute("dur", DurUs); + J.attribute("name", "Total " + Total.first); + J.attributeObject("args", [&] { + J.attribute("count", int64_t(Count)); + J.attribute("avg ms", int64_t(DurUs / Count / 1000)); }); + }); - ++TotalTid; - } - - auto writeMetadataEvent = [&](const char *Name, uint64_t Tid, - StringRef arg) { - J.object([&] { - J.attribute("cat", ""); - J.attribute("pid", Pid); - J.attribute("tid", int64_t(Tid)); - J.attribute("ts", 0); - J.attribute("ph", "M"); - J.attribute("name", Name); - J.attributeObject("args", [&] { J.attribute("name", arg); }); - }); - }; + ++TotalTid; + } + + auto writeMetadataEvent = [&](const char *Name, uint64_t Tid, StringRef arg) { + J.object([&] { + J.attribute("cat", ""); + J.attribute("pid", Pid); + J.attribute("tid", int64_t(Tid)); + J.attribute("ts", 0); + J.attribute("ph", "M"); + J.attribute("name", Name); + J.attributeObject("args", [&] { J.attribute("name", arg); }); + }); + }; - writeMetadataEvent("process_name", Tid, ProcName); - writeMetadataEvent("thread_name", Tid, ThreadName); - for (const TimeTraceProfiler *TTP : Instances.List) - writeMetadataEvent("thread_name", TTP->Tid, TTP->ThreadName); + writeMetadataEvent("process_name", Tid, ProcName); + writeMetadataEvent("thread_name", Tid, ThreadName); + for (const TimeTraceProfiler *TTP : Instances.List) + writeMetadataEvent("thread_name", TTP->Tid, TTP->ThreadName); - J.arrayEnd(); - J.attributeEnd(); + J.arrayEnd(); + J.attributeEnd(); - // Emit the absolute time when this TimeProfiler started. - // This can be used to combine the profiling data from - // multiple processes and preserve actual time intervals. - J.attribute("beginningOfTime", - time_point_cast(BeginningOfTime) - .time_since_epoch() - .count()); + // Emit the absolute time when this TimeProfiler started. + // This can be used to combine the profiling data from + // multiple processes and preserve actual time intervals. + J.attribute("beginningOfTime", time_point_cast(BeginningOfTime) + .time_since_epoch() + .count()); - J.objectEnd(); - } + J.objectEnd(); +} - SmallVector Stack; - SmallVector Entries; - StringMap CountAndTotalPerName; - const time_point BeginningOfTime; - const TimePointType StartTime; - const std::string ProcName; - const sys::Process::Pid Pid; - SmallString<0> ThreadName; - const uint64_t Tid; - - // Minimum time granularity (in microseconds) - const unsigned TimeTraceGranularity; -}; +TimeTraceProfilerInstances &getTimeTraceProfilerInstances() { + return Instances; +} +} // namespace llvm void llvm::timeTraceProfilerInitialize(unsigned TimeTraceGranularity, StringRef ProcName) { Index: llvm/test/Transforms/OpenMP/spmdization_constant_prop.ll =================================================================== --- llvm/test/Transforms/OpenMP/spmdization_constant_prop.ll +++ llvm/test/Transforms/OpenMP/spmdization_constant_prop.ll @@ -25,6 +25,7 @@ @__omp_rtl_debug_kind = weak_odr hidden local_unnamed_addr addrspace(1) constant i32 0 @__omp_rtl_assume_no_thread_state = weak_odr hidden local_unnamed_addr addrspace(1) constant i32 0 @omptarget_device_environment = weak protected addrspace(4) global %struct.DeviceEnvironmentTy undef, align 4 +@omptarget_device_time_profile = weak protected addrspace(4) global %struct.DeviceTimeProfileTy undef, align 4 @IsSPMDMode = weak hidden addrspace(3) global i32 undef, align 4 @.str.12 = private unnamed_addr addrspace(4) constant [47 x i8] c"ValueRAII initialization with wrong old value!\00", align 1 @_ZN12_GLOBAL__N_122SharedMemorySmartStackE = internal addrspace(3) global %"struct.(anonymous namespace)::SharedMemorySmartStackTy" undef, align 16 @@ -34,6 +35,7 @@ @_ZL29SharedMemVariableSharingSpace = internal unnamed_addr addrspace(3) global [64 x ptr] undef, align 16 @G = global i32 undef @llvm.used = appending addrspace(1) global [2 x ptr] [ptr addrspacecast (ptr addrspace(3) @IsSPMDMode to ptr), ptr addrspacecast (ptr addrspace(4) @omptarget_device_environment to ptr)], section "llvm.metadata" +@llvm.used = appending addrspace(1) global [2 x ptr] [ptr addrspacecast (ptr addrspace(3) @IsSPMDMode to ptr), ptr addrspacecast (ptr addrspace(4) @omptarget_device_time_profile to ptr)], section "llvm.metadata" @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(1) @__omp_offloading_20_11e3950_main_l12_exec_mode to ptr)], section "llvm.metadata" ; Function Attrs: alwaysinline convergent norecurse nounwind Index: openmp/libomptarget/DeviceRTL/include/Configuration.h =================================================================== --- openmp/libomptarget/DeviceRTL/include/Configuration.h +++ openmp/libomptarget/DeviceRTL/include/Configuration.h @@ -13,9 +13,19 @@ #ifndef OMPTARGET_CONFIGURATION_H #define OMPTARGET_CONFIGURATION_H +#include "DeviceTimeProfile.h" #include "Types.h" +#include "targetprofile.h" namespace _OMP { + +namespace targetprof { + +int getNumRecords(); +_OMP::prof::Record *getRecords(); + +} // namespace targetprof + namespace config { enum DebugKind : uint32_t { Index: openmp/libomptarget/DeviceRTL/include/Interface.h =================================================================== --- openmp/libomptarget/DeviceRTL/include/Interface.h +++ openmp/libomptarget/DeviceRTL/include/Interface.h @@ -132,6 +132,8 @@ int omp_get_team_num(); +int omp_get_num_timeprofile_records(); + int omp_get_initial_device(void); void *llvm_omp_target_dynamic_shared_alloc(); Index: openmp/libomptarget/DeviceRTL/src/Configuration.cpp =================================================================== --- openmp/libomptarget/DeviceRTL/src/Configuration.cpp +++ openmp/libomptarget/DeviceRTL/src/Configuration.cpp @@ -13,6 +13,7 @@ #include "Configuration.h" #include "DeviceEnvironment.h" +#include "DeviceTimeProfile.h" #include "State.h" #include "Types.h" @@ -31,6 +32,15 @@ DeviceEnvironmentTy CONSTANT(omptarget_device_environment) __attribute__((used, retain, weak, visibility("protected"))); +DeviceTimeProfileTy omptarget_device_time_profile + __attribute__((used, retain, weak, visibility("protected"))); + +int targetprof::getNumRecords() { return omptarget_device_time_profile.StartC; } + +_OMP::prof::Record *targetprof::getRecords() { + return omptarget_device_time_profile.Records; +} + uint32_t config::getDebugKind() { return __omp_rtl_debug_kind & omptarget_device_environment.DebugKind; } Index: openmp/libomptarget/DeviceRTL/src/Kernel.cpp =================================================================== --- openmp/libomptarget/DeviceRTL/src/Kernel.cpp +++ openmp/libomptarget/DeviceRTL/src/Kernel.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "Debug.h" +#include "DeviceTimeProfile.h" #include "Interface.h" #include "Mapping.h" #include "State.h" @@ -21,6 +22,44 @@ #pragma omp begin declare target device_type(nohost) +// CLOCKINIT(); +extern DeviceTimeProfileTy omptarget_device_time_profile; + +namespace _OMP { +namespace prof { + +__attribute__((used)) int clockIn(const char *FuncName, int Idx) { + if (Idx < OMP_ARRAY_SIZE) { +#pragma omp parallel + { + int thread_ = omp_get_thread_num(); + if (thread_ < OMP_THREADS) { + omptarget_device_time_profile.Records[Idx].Start[thread_] = + omp_get_wtime(); + omptarget_device_time_profile.Records[Idx].Thread[thread_] = thread_; + } + } + return Idx; + } else { + return OMP_ARRAY_SIZE; + } +} + +__attribute__((used)) void clockOut(const char *FuncName, int Idx) { + if (Idx < OMP_ARRAY_SIZE) { +#pragma omp parallel + { + int thread_ = omp_get_thread_num(); + if (thread_ < OMP_THREADS) { + omptarget_device_time_profile.Records[Idx].Stop[thread_] = + omp_get_wtime(); + } + } + } +} +} // namespace prof +} // namespace _OMP + static void inititializeRuntime(bool IsSPMD) { // Order is important here. synchronize::init(IsSPMD); @@ -78,6 +117,7 @@ // code and workers will run into a barrier right away. } + CLOCKIN(); if (IsSPMD) { state::assumeInitialState(IsSPMD); return -1; @@ -127,6 +167,7 @@ /// void __kmpc_target_deinit(IdentTy *Ident, int8_t Mode, bool) { FunctionTracingRAII(); + CLOCKOUT(); const bool IsSPMD = Mode & OMP_TGT_EXEC_MODE_SPMD; state::assumeInitialState(IsSPMD); if (IsSPMD) Index: openmp/libomptarget/DeviceRTL/src/State.cpp =================================================================== --- openmp/libomptarget/DeviceRTL/src/State.cpp +++ openmp/libomptarget/DeviceRTL/src/State.cpp @@ -389,6 +389,14 @@ int omp_get_team_num() { return mapping::getBlockId(); } +int omp_get_num_target_timeprofile_records() { + return targetprof::getNumRecords(); +} + +_OMP::prof::Record *omp_get_target_timeprofile_records() { + return targetprof::getRecords(); +} + int omp_get_initial_device(void) { return -1; } } Index: openmp/libomptarget/include/DeviceTimeProfile.h =================================================================== --- /dev/null +++ openmp/libomptarget/include/DeviceTimeProfile.h @@ -0,0 +1,49 @@ +//=-- libomptarget/DeviceTimeProfile.h - DeviceTimeProfile def “-*- C++ -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the declaration of the DeviceTimeProfile class which is +/// the base class for the Device Time data collected on target +/// +//===----------------------------------------------------------------------===// + +#ifndef _OMPTARGET_DEVICETIMEPROFILE_H +#define _OMPTARGET_DEVICETIMEPROFILE_H + +#include "targetprofile.h" + +#define CLOCKDEC() DeviceTimeProfileTy omptarget_device_time_profile; + +#define CLOCKINIT() omptarget_device_time_profile.StartC = 0; + +#define CLOCKSYNC() clockSync(&Device); + +#define CLOCKFINALIZE() clockFinalize(this); + +#define CLOCKIN() \ + if (omptarget_device_time_profile.StartC < OMP_ARRAY_SIZE) { \ + omptarget_device_time_profile.Idx = \ + _OMP::prof::clockIn(__func__, omptarget_device_time_profile.StartC++); \ + }; + +#define CLOCKOUT() \ + if (omptarget_device_time_profile.Idx < OMP_ARRAY_SIZE) { \ + _OMP::prof::clockOut(__func__, omptarget_device_time_profile.Idx); \ + } + +struct alignas(32) DeviceTimeProfileTy { + int DeviceId; + int StartC; + int Idx; + _OMP::prof::Record Records[OMP_ARRAY_SIZE]; +}; + +#pragma omp begin declare target +extern DeviceTimeProfileTy omptarget_device_time_profile; +#pragma omp end declare target +#endif Index: openmp/libomptarget/include/device.h =================================================================== --- openmp/libomptarget/include/device.h +++ openmp/libomptarget/include/device.h @@ -350,6 +350,8 @@ // Return true if data can be copied to DstDevice directly bool isDataExchangable(const DeviceTy &DstDevice); + int32_t getTimeProfilePtr(void **ptr); + /// Lookup the mapping of \p HstPtrBegin in \p HDTTMap. The accessor ensures /// exclusive access to the HDTT map. LookupResult lookupMapping(HDTTMapAccessorTy &HDTTMap, void *HstPtrBegin, Index: openmp/libomptarget/include/omptargetplugin.h =================================================================== --- openmp/libomptarget/include/omptargetplugin.h +++ openmp/libomptarget/include/omptargetplugin.h @@ -190,6 +190,7 @@ int32_t __tgt_rtl_init_async_info(int32_t ID, __tgt_async_info **AsyncInfoPtr); int32_t __tgt_rtl_init_device_info(int32_t ID, __tgt_device_info *DeviceInfoPtr, const char **ErrStr); +int32_t __tgt_rtl_get_timeprofile_data(int32_t DeviceId, void **DataPtr); #ifdef __cplusplus } Index: openmp/libomptarget/include/rtl.h =================================================================== --- openmp/libomptarget/include/rtl.h +++ openmp/libomptarget/include/rtl.h @@ -70,6 +70,7 @@ typedef int32_t(record_event_ty)(int32_t, void *, __tgt_async_info *); typedef int32_t(wait_event_ty)(int32_t, void *, __tgt_async_info *); typedef int32_t(sync_event_ty)(int32_t, void *); + typedef int32_t(time_profile_ptr_ty)(int32_t, void **); typedef int32_t(destroy_event_ty)(int32_t, void *); typedef int32_t(release_async_info_ty)(int32_t, __tgt_async_info *); typedef int32_t(init_async_info_ty)(int32_t, __tgt_async_info **); @@ -117,6 +118,7 @@ supports_empty_images_ty *supports_empty_images = nullptr; set_info_flag_ty *set_info_flag = nullptr; print_device_info_ty *print_device_info = nullptr; + time_profile_ptr_ty *time_profile_ptr = nullptr; create_event_ty *create_event = nullptr; record_event_ty *record_event = nullptr; wait_event_ty *wait_event = nullptr; Index: openmp/libomptarget/include/targetprofile.h =================================================================== --- /dev/null +++ openmp/libomptarget/include/targetprofile.h @@ -0,0 +1,50 @@ +//==- libomptarget/targetprofile.h - Target profile header file --*- C++ +//-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the declaration of the Structs needed for target +/// profiling. It provides the base data structures to store the data +/// collected. +/// +//===----------------------------------------------------------------------===// + +#ifndef TARGETPROFILE_H +#define TARGETPROFILE_H + +#define OMP_THREADS 5 +#define OMP_EVENT_SIZE 64 +#define OMP_ARRAY_SIZE 10240 + +namespace _OMP { +namespace prof { +struct alignas(32) Record { + int Thread[OMP_THREADS]; + double Start[OMP_THREADS]; + double Stop[OMP_THREADS]; + char Event[OMP_EVENT_SIZE][OMP_THREADS]; +}; + +struct ThreadRecord { + int Thread; + double Start; + double Stop; + char Event[OMP_EVENT_SIZE]; +}; +#pragma omp begin declare target +extern Record Records[OMP_ARRAY_SIZE]; +extern int StartC; +extern int Idx; +int clockIn(const char *FuncName, int Idx); +void clockOut(const char *FuncName, int Idx); +// void clockFinalize(DeviceTy*); +#pragma omp end declare target +} // namespace prof +} // namespace _OMP + +#endif Index: openmp/libomptarget/plugins/amdgpu/src/rtl.cpp =================================================================== --- openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -38,6 +38,7 @@ #include "rt.h" #include "DeviceEnvironment.h" +#include "DeviceTimeProfile.h" #include "get_elf_mach_gfx_name.h" #include "omptargetplugin.h" #include "print_tracing.h" @@ -1786,6 +1787,109 @@ } }; +struct DeviceTimeProfile { + // initialise an DeviceTimeProfileTy in the deviceRTL + // patches around differences in the deviceRTL between trunk, aomp, + // rocmcc. Over time these differences will tend to zero and this class + // simplified. + // Symbol may be in .data or .bss, and may be missing fields, todo: + // review aomp/trunk/rocm and simplify the following + + // The symbol may also have been deadstripped because the device side + // accessors were unused. + + // If the symbol is in .data (aomp, rocm) it can be written directly. + // If it is in .bss, we must wait for it to be allocated space on the + // gpu (trunk) and initialize after loading. + const char *sym() { return "omptarget_device_time_profile"; } + + DeviceTimeProfileTy HostDeviceTimeProfile; + SymbolInfo SI; + bool Valid = false; + + __tgt_device_image *Image; + const size_t ImgSize; + + DeviceTimeProfile(int DeviceId, _OMP::prof::Record records_[OMP_ARRAY_SIZE], + int startc_, int idx_, __tgt_device_image *Image, + const size_t ImgSize) + : Image(Image), ImgSize(ImgSize) { + + HostDeviceTimeProfile.StartC = startc_; + HostDeviceTimeProfile.Idx = idx_; + HostDeviceTimeProfile.DeviceId = DeviceId; + for (int i = 0; i < startc_; i++) { + for (int j = 0; j < OMP_THREADS; j++) { + HostDeviceTimeProfile.Records[i].Thread[j] = records_[i].Thread[j]; + HostDeviceTimeProfile.Records[i].Stop[j] = records_[i].Stop[j]; + HostDeviceTimeProfile.Records[i].Start[j] = records_[i].Start[j]; + } + } + /* + if (char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) + HostDeviceTimeProfile.DebugKind = std::stoi(EnvStr); + */ + int Rc = getSymbolInfoWithoutLoading((char *)Image->ImageStart, ImgSize, + sym(), &SI); + if (Rc != 0) { + DP("Finding global device time profile '%s' - symbol missing.\n", sym()); + return; + } + + if (SI.Size > sizeof(HostDeviceTimeProfile)) { + DP("Symbol '%s' has size %u, expected at most %zu.\n", sym(), SI.Size, + sizeof(HostDeviceTimeProfile)); + return; + } + + Valid = true; + } + + bool inImage() { return SI.ShType != SHT_NOBITS; } + + hsa_status_t beforeLoading(void *Data, size_t Size) { + if (Valid) { + if (inImage()) { + DP("Setting global device environment before load (%u bytes)\n", + SI.Size); + uint64_t Offset = (char *)SI.Addr - (char *)Image->ImageStart; + void *Pos = (char *)Data + Offset; + memcpy(Pos, &HostDeviceTimeProfile, SI.Size); + } + } + return HSA_STATUS_SUCCESS; + } + + hsa_status_t afterLoading() { + if (Valid) { + if (!inImage()) { + DP("Setting global device environment after load (%u bytes)\n", + SI.Size); + int DeviceId = HostDeviceTimeProfile.DeviceId; + auto &SymbolInfo = DeviceInfo().SymbolInfoTable[DeviceId]; + void *StatePtr; + uint32_t StatePtrSize; + hsa_status_t Err = interop_hsa_get_symbol_info( + SymbolInfo, DeviceId, sym(), &StatePtr, &StatePtrSize); + if (Err != HSA_STATUS_SUCCESS) { + DP("failed to find %s in loaded image\n", sym()); + return Err; + } + + if (StatePtrSize != SI.Size) { + DP("Symbol had size %u before loading, %u after\n", StatePtrSize, + SI.Size); + return HSA_STATUS_ERROR; + } + + return DeviceInfo().freesignalpoolMemcpyH2D( + StatePtr, &HostDeviceTimeProfile, StatePtrSize, DeviceId); + } + } + return HSA_STATUS_SUCCESS; + } +}; + hsa_status_t implCalloc(void **RetPtr, size_t Size, int DeviceId) { uint64_t Rounded = 4 * ((Size + 3) / 4); void *Ptr; Index: openmp/libomptarget/plugins/cuda/src/rtl.cpp =================================================================== --- openmp/libomptarget/plugins/cuda/src/rtl.cpp +++ openmp/libomptarget/plugins/cuda/src/rtl.cpp @@ -25,6 +25,7 @@ #include "Debug.h" #include "DeviceEnvironment.h" +#include "DeviceTimeProfile.h" #include "omptarget.h" #include "omptargetplugin.h" @@ -35,6 +36,9 @@ #include "llvm/Frontend/OpenMP/OMPConstants.h" +DeviceTimeProfileTy DeviceTimeProf; +CUdeviceptr DeviceTimeProfPtr; + using namespace llvm; // Utility for retrieving and printing CUDA error string. @@ -120,6 +124,11 @@ return OFFLOAD_SUCCESS; } +int32_t getTimeProfilePtr(void **Ptr) { + *Ptr = (void *)DeviceTimeProfPtr; + return CUDA_SUCCESS; +} + int recordEvent(void *EventPtr, __tgt_async_info *AsyncInfo) { CUstream Stream = reinterpret_cast(AsyncInfo->Queue); CUevent Event = reinterpret_cast(EventPtr); @@ -972,6 +981,32 @@ } } + { + // TODO: The device ID used here is not the real device ID used by OpenMP. + + const char *DeviceTimeProfName = "omptarget_device_time_profile"; + size_t CUSize; + + Err = cuModuleGetGlobal(&DeviceTimeProfPtr, &CUSize, Module, + DeviceTimeProfName); + if (Err == CUDA_SUCCESS) { + if (CUSize != sizeof(DeviceTimeProfileTy)) { + REPORT( + "Global device_time_profile '%s' - size mismatch (%zu != %zu)\n", + DeviceTimeProfName, CUSize, sizeof(DeviceTimeProfileTy)); + CUDA_ERR_STRING(Err); + return nullptr; + } + + DP("Sending global device time profile data %zu bytes\n", CUSize); + } else { + DP("Finding global device time profile '%s' - symbol missing.\n", + DeviceTimeProfName); + DP("Continue, considering this is a device RTL which does not accept " + "environment setting.\n"); + } + } + return getOffloadEntriesTable(DeviceId); } @@ -1865,6 +1900,11 @@ return DeviceRTL.initDeviceInfo(DeviceId, DeviceInfoPtr, ErrStr); } +int32_t __tgt_rtl_get_timeprofile_data(int32_t DeviceId, void **DataPtr) { + // assert(!data_ptr && "data_ptr is not nullptr"); + return getTimeProfilePtr(DataPtr); +} + #ifdef __cplusplus } #endif Index: openmp/libomptarget/plugins/exports =================================================================== --- openmp/libomptarget/plugins/exports +++ openmp/libomptarget/plugins/exports @@ -35,6 +35,7 @@ __tgt_rtl_destroy_event; __tgt_rtl_init_device_info; __tgt_rtl_init_async_info; + __tgt_rtl_get_timeprofile_data; local: *; }; Index: openmp/libomptarget/src/device.cpp =================================================================== --- openmp/libomptarget/src/device.cpp +++ openmp/libomptarget/src/device.cpp @@ -11,17 +11,88 @@ //===----------------------------------------------------------------------===// #include "device.h" +#include "DeviceTimeProfile.h" #include "omptarget.h" #include "private.h" #include "rtl.h" +#include "targetprofile.h" +#include +#include + +#include #include +#include #include +#include #include #include +#include #include #include +CLOCKDEC(); +void sideloadTimeProfileEntries(_OMP::prof::Record Recs[OMP_ARRAY_SIZE], + int total_entries) { + // nanoseconds Multiplier + bool TPINotInit = false; + llvm::TimeTraceProfiler *TPI = llvm::getTimeTraceProfilerInstance(); + const long long int Multiplier = 100000000; + + auto makeRecEntry = [TPI](const _OMP::prof::Record R, size_t Idx, + long long int OffsetNs = 0) { + char TargetDetail[64]; + sprintf(TargetDetail, "Target Thread#%d", R.Thread[Idx]); + long long int Start_ = (R.Start[Idx] * Multiplier) - OffsetNs; + std::chrono::duration> Start_dur( + Start_); + long long int Stop_ = (R.Stop[Idx] * Multiplier) - OffsetNs; + std::chrono::duration> Stop_dur( + Stop_); + + llvm::Entry E(std::chrono::time_point(Start_dur), + std::chrono::time_point(Stop_dur), + std::string("__tgt_op"), std::string(TargetDetail)); + + DurationType Duration = E.End - E.Start; + auto &CountAndTotal = TPI->CountAndTotalPerName[E.Name]; + CountAndTotal.first++; + CountAndTotal.second += Duration; + return E; + }; + + // TPI->BeginningOfTime = + // std::chrono::time_point(BeginningOfTime_Start); + auto now_ms = + std::chrono::time_point_cast(TPI->StartTime); + auto value = now_ms.time_since_epoch(); + long long int duration = value.count(); + auto time_ = Recs[0].Start[0] * 1000000; + if (TPI == nullptr) { + llvm::TimeTraceProfiler TPV = + llvm::TimeTraceProfiler(0, "Target Operations"); + TPI = &TPV; + TPINotInit = true; + } + for (size_t i = 0; i < total_entries; i++) { + for (size_t j = 0; j < OMP_THREADS; j++) { + TPI->Entries.emplace_back(makeRecEntry(Recs[i], j)); + } + } + if (TPINotInit) + llvm::Instances.List.emplace_back(TPI); +} + +void clockFinalize(DeviceTy *Device) { + int32_t Idx = omptarget_device_time_profile.Idx; + assert(Idx >= 0 && Idx < OMP_ARRAY_SIZE && "Out of beyond bounds"); + // TODO: use env var + if (Idx > 0) { + sideloadTimeProfileEntries(omptarget_device_time_profile.Records, Idx); + } else + printf("No Target\n"); +} + int HostDataToTargetTy::addEventIfNecessary(DeviceTy &Device, AsyncInfoTy &AsyncInfo) const { // First, check if the user disabled atomic map transfer/malloc/dealloc. @@ -55,6 +126,7 @@ PendingGlobalsMtx(), ShadowMtx() {} DeviceTy::~DeviceTy() { + CLOCKFINALIZE(); if (DeviceID == -1 || !(getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)) return; @@ -503,6 +575,7 @@ /// Thread-safe method to initialize the device only once. int32_t DeviceTy::initOnce() { std::call_once(InitFlag, &DeviceTy::init, this); + CLOCKINIT(); // At this point, if IsInit is true, then either this thread or some other // thread in the past successfully initialized the device, so we can return @@ -556,6 +629,12 @@ AsyncInfo); } +int32_t DeviceTy::getTimeProfilePtr(void **ptr) { + if (RTL->time_profile_ptr) + return RTL->time_profile_ptr(RTLDeviceID, ptr); + return OFFLOAD_SUCCESS; +} + // Retrieve data from device int32_t DeviceTy::retrieveData(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size, AsyncInfoTy &AsyncInfo) { Index: openmp/libomptarget/src/interface.cpp =================================================================== --- openmp/libomptarget/src/interface.cpp +++ openmp/libomptarget/src/interface.cpp @@ -11,6 +11,7 @@ // //===----------------------------------------------------------------------===// +#include "DeviceTimeProfile.h" #include "device.h" #include "omptarget.h" #include "private.h" @@ -21,6 +22,8 @@ #include #include +extern DeviceTimeProfileTy omptarget_device_time_profile; + //////////////////////////////////////////////////////////////////////////////// /// adds requires flags EXTERN void __tgt_register_requires(int64_t Flags) { @@ -111,6 +114,19 @@ ArgSizes, ArgTypes, ArgNames, ArgMappers); } +// At device deconstructor context gets destroyed. To work around this the +// collected data is synced target -> host after every kernel deinit. +void clockSync(DeviceTy *Device) { + AsyncInfoTy async_info(*Device); + void *tgt_ptr = nullptr; + void *hst_ptr = &omptarget_device_time_profile; + Device->getTimeProfilePtr(&tgt_ptr); + Device->retrieveData(hst_ptr, tgt_ptr, sizeof(DeviceTimeProfileTy), + async_info); + int32_t Idx = omptarget_device_time_profile.Idx; + assert(Idx > 0 && Idx < OMP_ARRAY_SIZE && "Out of beyond bounds"); +} + /// passes data from the target, releases target memory and destroys /// the host-target mapping (top entry from the stack of data maps) /// created by the last __tgt_target_data_begin. @@ -142,6 +158,7 @@ #endif AsyncInfoTy AsyncInfo(Device); + CLOCKSYNC(); int Rc = targetDataEnd(Loc, Device, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames, ArgMappers, AsyncInfo); if (Rc == OFFLOAD_SUCCESS) Index: openmp/libomptarget/src/rtl.cpp =================================================================== --- openmp/libomptarget/src/rtl.cpp +++ openmp/libomptarget/src/rtl.cpp @@ -242,6 +242,8 @@ DynLibrary->getAddressOfSymbol("__tgt_rtl_init_async_info"); *((void **)&R.init_device_info) = DynLibrary->getAddressOfSymbol("__tgt_rtl_init_device_info"); + *((void **)&R.time_profile_ptr) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_get_timeprofile_data"); R.LibraryHandler = std::move(DynLibrary); }