diff --git a/openmp/tools/archer/README.md b/openmp/tools/archer/README.md --- a/openmp/tools/archer/README.md +++ b/openmp/tools/archer/README.md @@ -131,6 +131,15 @@ + + +report_data_leak +0 +Report leaking OMPT data for execution under +Archer. Used for testing and debugging Archer if errors occur. + + + verbose diff --git a/openmp/tools/archer/ompt-tsan.cpp b/openmp/tools/archer/ompt-tsan.cpp --- a/openmp/tools/archer/ompt-tsan.cpp +++ b/openmp/tools/archer/ompt-tsan.cpp @@ -24,8 +24,9 @@ #include #include #include -#include #include +#include +#include #include #include @@ -34,7 +35,6 @@ #endif #include "omp-tools.h" -#include // Define attribute that indicates that the fall through from the previous // case label is intentional and should not be diagnosed by a compiler @@ -61,6 +61,7 @@ int print_max_rss{0}; int verbose{0}; int enabled{1}; + int report_data_leak{0}; int ignore_serial{0}; ArcherFlags(const char *env) { @@ -82,6 +83,8 @@ continue; if (sscanf(it->c_str(), "verbose=%d", &verbose)) continue; + if (sscanf(it->c_str(), "report_data_leak=%d", &report_data_leak)) + continue; if (sscanf(it->c_str(), "enable=%d", &enabled)) continue; if (sscanf(it->c_str(), "ignore_serial=%d", &ignore_serial)) @@ -214,7 +217,7 @@ static ompt_get_parallel_info_t ompt_get_parallel_info; static ompt_get_thread_data_t ompt_get_thread_data; -typedef uint64_t ompt_tsan_clockid; +typedef char ompt_tsan_clockid; static uint64_t my_next_id() { static uint64_t ID = 0; @@ -222,100 +225,132 @@ return ret; } +static int pagesize{0}; + // Data structure to provide a threadsafe pool of reusable objects. -// DataPool -template struct DataPool { - std::mutex DPMutex; - std::stack DataPointer; +// DataPool +template struct DataPool final { + static __thread DataPool *ThreadDataPool; + std::mutex DPMutex{}; + + // store unused objects + std::vector DataPointer{}; + std::vector RemoteDataPointer{}; + + // store all allocated memory to finally release std::list memory; - int total; + // count remotely returned data (RemoteDataPointer.size()) + std::atomic remote{0}; + + // totally allocated data objects in pool + int total{0}; +#ifdef DEBUG_DATA + int remoteReturn{0}; + int localReturn{0}; + + int getRemote() { return remoteReturn + remote; } + int getLocal() { return localReturn; } +#endif + int getTotal() { return total; } + int getMissing() { + return total - DataPointer.size() - RemoteDataPointer.size(); + } + + // fill the pool by allocating a page of memory void newDatas() { - // prefix the Data with a pointer to 'this', allows to return memory to - // 'this', - // without explicitly knowing the source. - // - // To reduce lock contention, we use thread local DataPools, but Data - // objects move to other threads. - // The strategy is to get objects from local pool. Only if the object moved - // to another - // thread, we might see a penalty on release (returnData). - // For "single producer" pattern, a single thread creates tasks, these are - // executed by other threads. - // The master will have a high demand on TaskData, so return after use. - struct pooldata { - DataPool *dp; - T data; - }; - // We alloc without initialize the memory. We cannot call constructors. - // Therefore use malloc! - pooldata *datas = (pooldata *)malloc(sizeof(pooldata) * N); + if (remote > 0) { + const std::lock_guard lock(DPMutex); + // DataPointer is empty, so just swap the vectors + DataPointer.swap(RemoteDataPointer); + remote = 0; + return; + } + // calculate size of an object including padding to cacheline size + size_t elemSize = sizeof(T); + size_t paddedSize = (((elemSize - 1) / 64) + 1) * 64; + // number of padded elements to allocate + int ndatas = pagesize / paddedSize; + char *datas = (char *)malloc(ndatas * paddedSize); memory.push_back(datas); - for (int i = 0; i < N; i++) { - datas[i].dp = this; - DataPointer.push(&(datas[i].data)); + for (int i = 0; i < ndatas; i++) { + DataPointer.push_back(new (datas + i * paddedSize) T(this)); } - total += N; + total += ndatas; } + // get data from the pool T *getData() { T *ret; - DPMutex.lock(); if (DataPointer.empty()) newDatas(); - ret = DataPointer.top(); - DataPointer.pop(); - DPMutex.unlock(); + ret = DataPointer.back(); + DataPointer.pop_back(); return ret; } - void returnData(T *data) { - DPMutex.lock(); - DataPointer.push(data); - DPMutex.unlock(); - } - - void getDatas(int n, T **datas) { - DPMutex.lock(); - for (int i = 0; i < n; i++) { - if (DataPointer.empty()) - newDatas(); - datas[i] = DataPointer.top(); - DataPointer.pop(); - } - DPMutex.unlock(); + // accesses to the thread-local datapool don't need locks + void returnOwnData(T *data) { + DataPointer.emplace_back(data); +#ifdef DEBUG_DATA + localReturn++; +#endif } - void returnDatas(int n, T **datas) { - DPMutex.lock(); - for (int i = 0; i < n; i++) { - DataPointer.push(datas[i]); - } - DPMutex.unlock(); + // returning to a remote datapool using lock + void returnData(T *data) { + const std::lock_guard lock(DPMutex); + RemoteDataPointer.emplace_back(data); + remote++; +#ifdef DEBUG_DATA + remoteReturn++; +#endif } - DataPool() : DPMutex(), DataPointer(), total(0) {} - ~DataPool() { // we assume all memory is returned when the thread finished / destructor is // called + if (archer_flags->report_data_leak && getMissing() != 0) { + printf("ERROR: While freeing DataPool (%s) we are missing %i data " + "objects.\n", + __PRETTY_FUNCTION__, getMissing()); + exit(-3); + } + for (auto i : DataPointer) + if (i) + i->~T(); + for (auto i : RemoteDataPointer) + if (i) + i->~T(); for (auto i : memory) if (i) free(i); } }; -// This function takes care to return the data to the originating DataPool -// A pointer to the originating DataPool is stored just before the actual data. -template static void retData(void *data) { - ((DataPool **)data)[-1]->returnData((T *)data); -} +template struct DataPoolEntry { + DataPool *owner; + + static T *New() { return DataPool::ThreadDataPool->getData(); } + + void Delete() { + static_cast(this)->Reset(); + if (owner == DataPool::ThreadDataPool) + owner->returnOwnData(static_cast(this)); + else + owner->returnData(static_cast(this)); + } + + DataPoolEntry(DataPool *dp) : owner(dp) {} +}; struct ParallelData; -__thread DataPool *pdp; +typedef DataPool ParallelDataPool; +template <> +__thread ParallelDataPool *ParallelDataPool::ThreadDataPool = nullptr; /// Data structure to store additional information for parallel regions. -struct ParallelData { +struct ParallelData final : DataPoolEntry { // Parallel fork is just another barrier, use Barrier[1] @@ -328,14 +363,18 @@ void *GetBarrierPtr(unsigned Index) { return &(Barrier[Index]); } - ParallelData(const void *codeptr) : codePtr(codeptr) {} - ~ParallelData() { - TsanDeleteClock(&(Barrier[0])); - TsanDeleteClock(&(Barrier[1])); + ParallelData *Init(const void *codeptr) { + codePtr = codeptr; + return this; } - // overload new/delete to use DataPool for memory management. - void *operator new(size_t size) { return pdp->getData(); } - void operator delete(void *p, size_t) { retData(p); } + + void Reset() {} + + static ParallelData *New(const void *codeptr) { + return DataPoolEntry::New()->Init(codeptr); + } + + ParallelData(DataPool *dp) : DataPoolEntry(dp) {} }; static inline ParallelData *ToParallelData(ompt_data_t *parallel_data) { @@ -343,95 +382,83 @@ } struct Taskgroup; -__thread DataPool *tgp; +typedef DataPool TaskgroupPool; +template <> __thread TaskgroupPool *TaskgroupPool::ThreadDataPool = nullptr; /// Data structure to support stacking of taskgroups and allow synchronization. -struct Taskgroup { +struct Taskgroup final : DataPoolEntry { /// Its address is used for relationships of the taskgroup's task set. ompt_tsan_clockid Ptr; /// Reference to the parent taskgroup. Taskgroup *Parent; - Taskgroup(Taskgroup *Parent) : Parent(Parent) {} - ~Taskgroup() { TsanDeleteClock(&Ptr); } - void *GetPtr() { return &Ptr; } - // overload new/delete to use DataPool for memory management. - void *operator new(size_t size) { return tgp->getData(); } - void operator delete(void *p, size_t) { retData(p); } + + Taskgroup *Init(Taskgroup *parent) { + Parent = parent; + return this; + } + + void Reset() {} + + static Taskgroup *New(Taskgroup *Parent) { + return DataPoolEntry::New()->Init(Parent); + } + + Taskgroup(DataPool *dp) : DataPoolEntry(dp) {} }; struct TaskData; -__thread DataPool *tdp; +typedef DataPool TaskDataPool; +template <> __thread TaskDataPool *TaskDataPool::ThreadDataPool = nullptr; /// Data structure to store additional information for tasks. -struct TaskData { +struct TaskData final : DataPoolEntry { /// Its address is used for relationships of this task. - ompt_tsan_clockid Task; + ompt_tsan_clockid Task{0}; /// Child tasks use its address to declare a relationship to a taskwait in /// this task. - ompt_tsan_clockid Taskwait; + ompt_tsan_clockid Taskwait{0}; /// Whether this task is currently executing a barrier. - bool InBarrier; + bool InBarrier{false}; /// Whether this task is an included task. int TaskType{0}; + /// count execution phase + int execution{0}; + /// Index of which barrier to use next. - char BarrierIndex; + char BarrierIndex{0}; /// Count how often this structure has been put into child tasks + 1. - std::atomic_int RefCount; + std::atomic_int RefCount{1}; /// Reference to the parent that created this task. - TaskData *Parent; + TaskData *Parent{nullptr}; /// Reference to the implicit task in the stack above this task. - TaskData *ImplicitTask; + TaskData *ImplicitTask{nullptr}; /// Reference to the team of this task. - ParallelData *Team; + ParallelData *Team{nullptr}; /// Reference to the current taskgroup that this task either belongs to or /// that it just created. - Taskgroup *TaskGroup; + Taskgroup *TaskGroup{nullptr}; /// Dependency information for this task. - ompt_dependence_t *Dependencies; + ompt_dependence_t *Dependencies{nullptr}; /// Number of dependency entries. - unsigned DependencyCount; - - void *PrivateData; - size_t PrivateDataSize; - - int execution; - int freed; + unsigned DependencyCount{0}; - TaskData(TaskData *Parent, int taskType) - : InBarrier(false), TaskType(taskType), BarrierIndex(0), RefCount(1), - Parent(Parent), ImplicitTask(nullptr), Team(Parent->Team), - TaskGroup(nullptr), DependencyCount(0), execution(0), freed(0) { - if (Parent != nullptr) { - Parent->RefCount++; - // Copy over pointer to taskgroup. This task may set up its own stack - // but for now belongs to its parent's taskgroup. - TaskGroup = Parent->TaskGroup; - } - } - - TaskData(ParallelData *Team, int taskType) - : InBarrier(false), TaskType(taskType), BarrierIndex(0), RefCount(1), - Parent(nullptr), ImplicitTask(this), Team(Team), TaskGroup(nullptr), - DependencyCount(0), execution(1), freed(0) {} - - ~TaskData() { - TsanDeleteClock(&Task); - TsanDeleteClock(&Taskwait); - } +#ifdef DEBUG + int freed{0}; +#endif bool isIncluded() { return TaskType & ompt_task_undeferred; } bool isUntied() { return TaskType & ompt_task_untied; } @@ -447,9 +474,54 @@ void *GetTaskPtr() { return &Task; } void *GetTaskwaitPtr() { return &Taskwait; } - // overload new/delete to use DataPool for memory management. - void *operator new(size_t size) { return tdp->getData(); } - void operator delete(void *p, size_t) { retData(p); } + + TaskData *Init(TaskData *parent, int taskType) { + TaskType = taskType; + Parent = parent; + Team = Parent->Team; + if (Parent != nullptr) { + Parent->RefCount++; + // Copy over pointer to taskgroup. This task may set up its own stack + // but for now belongs to its parent's taskgroup. + TaskGroup = Parent->TaskGroup; + } + return this; + } + + TaskData *Init(ParallelData *team, int taskType) { + TaskType = taskType; + execution = 1; + ImplicitTask = this; + Team = team; + return this; + } + + void Reset() { + InBarrier = false; + TaskType = 0; + execution = 0; + BarrierIndex = 0; + RefCount = 1; + Parent = nullptr; + ImplicitTask = nullptr; + Team = nullptr; + TaskGroup = nullptr; + Dependencies = nullptr; + DependencyCount = 0; +#ifdef DEBUG + freed = 0; +#endif + } + + static TaskData *New(TaskData *parent, int taskType) { + return DataPoolEntry::New()->Init(parent, taskType); + } + + static TaskData *New(ParallelData *team, int taskType) { + return DataPoolEntry::New()->Init(team, taskType); + } + + TaskData(DataPool *dp) : DataPoolEntry(dp) {} }; static inline TaskData *ToTaskData(ompt_data_t *task_data) { @@ -470,19 +542,22 @@ static void ompt_tsan_thread_begin(ompt_thread_t thread_type, ompt_data_t *thread_data) { - pdp = new DataPool; - TsanNewMemory(pdp, sizeof(pdp)); - tgp = new DataPool; - TsanNewMemory(tgp, sizeof(tgp)); - tdp = new DataPool; - TsanNewMemory(tdp, sizeof(tdp)); + ParallelDataPool::ThreadDataPool = new ParallelDataPool; + TsanNewMemory(ParallelDataPool::ThreadDataPool, + sizeof(ParallelDataPool::ThreadDataPool)); + TaskgroupPool::ThreadDataPool = new TaskgroupPool; + TsanNewMemory(TaskgroupPool::ThreadDataPool, + sizeof(TaskgroupPool::ThreadDataPool)); + TaskDataPool::ThreadDataPool = new TaskDataPool; + TsanNewMemory(TaskDataPool::ThreadDataPool, + sizeof(TaskDataPool::ThreadDataPool)); thread_data->value = my_next_id(); } static void ompt_tsan_thread_end(ompt_data_t *thread_data) { - delete pdp; - delete tgp; - delete tdp; + delete ParallelDataPool::ThreadDataPool; + delete TaskgroupPool::ThreadDataPool; + delete TaskDataPool::ThreadDataPool; } /// OMPT event callbacks for handling parallel regions. @@ -492,7 +567,7 @@ ompt_data_t *parallel_data, uint32_t requested_team_size, int flag, const void *codeptr_ra) { - ParallelData *Data = new ParallelData(codeptr_ra); + ParallelData *Data = ParallelData::New(codeptr_ra); parallel_data->ptr = Data; TsanHappensBefore(Data->GetParallelPtr()); @@ -509,7 +584,7 @@ TsanHappensAfter(Data->GetBarrierPtr(0)); TsanHappensAfter(Data->GetBarrierPtr(1)); - delete Data; + Data->Delete(); #if (LLVM_VERSION >= 40) if (&__archer_get_omp_status) { @@ -527,19 +602,24 @@ switch (endpoint) { case ompt_scope_begin: if (type & ompt_task_initial) { - parallel_data->ptr = new ParallelData(nullptr); + parallel_data->ptr = ParallelData::New(nullptr); } - task_data->ptr = new TaskData(ToParallelData(parallel_data), type); + task_data->ptr = TaskData::New(ToParallelData(parallel_data), type); TsanHappensAfter(ToParallelData(parallel_data)->GetParallelPtr()); TsanFuncEntry(ToParallelData(parallel_data)->codePtr); break; case ompt_scope_end: { TaskData *Data = ToTaskData(task_data); +#ifdef DEBUG assert(Data->freed == 0 && "Implicit task end should only be called once!"); Data->freed = 1; +#endif assert(Data->RefCount == 1 && "All tasks should have finished at the implicit barrier!"); - delete Data; + Data->Delete(); + if (type & ompt_task_initial) { + ToParallelData(parallel_data)->Delete(); + } TsanFuncExit(); break; } @@ -588,7 +668,7 @@ break; case ompt_sync_region_taskgroup: - Data->TaskGroup = new Taskgroup(Data->TaskGroup); + Data->TaskGroup = Taskgroup::New(Data->TaskGroup); break; case ompt_sync_region_reduction: @@ -643,7 +723,7 @@ // Delete this allocated taskgroup, all descendent task are finished by // now. Taskgroup *Parent = Data->TaskGroup->Parent; - delete Data->TaskGroup; + Data->TaskGroup->Delete(); Data->TaskGroup = Parent; break; } @@ -705,16 +785,16 @@ ompt_data_t *parallel_data; int team_size = 1; ompt_get_parallel_info(0, ¶llel_data, &team_size); - ParallelData *PData = new ParallelData(nullptr); + ParallelData *PData = ParallelData::New(nullptr); parallel_data->ptr = PData; - Data = new TaskData(PData, type); + Data = TaskData::New(PData, type); new_task_data->ptr = Data; } else if (type & ompt_task_undeferred) { - Data = new TaskData(ToTaskData(parent_task_data), type); + Data = TaskData::New(ToTaskData(parent_task_data), type); new_task_data->ptr = Data; } else if (type & ompt_task_explicit || type & ompt_task_target) { - Data = new TaskData(ToTaskData(parent_task_data), type); + Data = TaskData::New(ToTaskData(parent_task_data), type); new_task_data->ptr = Data; // Use the newly created address. We cannot use a single address from the @@ -731,7 +811,7 @@ if (task->DependencyCount > 0) { delete[] task->Dependencies; } - delete task; + task->Delete(); task = Parent; } } @@ -1006,6 +1086,8 @@ return NULL; } + pagesize = getpagesize(); + static ompt_start_tool_result_t ompt_start_tool_result = { &ompt_tsan_initialize, &ompt_tsan_finalize, {0}}; diff --git a/openmp/tools/archer/tests/lit.cfg b/openmp/tools/archer/tests/lit.cfg --- a/openmp/tools/archer/tests/lit.cfg +++ b/openmp/tools/archer/tests/lit.cfg @@ -90,9 +90,12 @@ if 'INTEL_LICENSE_FILE' in os.environ: config.environment['INTEL_LICENSE_FILE'] = os.environ['INTEL_LICENSE_FILE'] +config.environment['ARCHER_OPTIONS'] = "report_data_leak=1" + # Race Tests config.substitutions.append(("%libarcher-compile-and-run-race-noserial", \ - "%libarcher-compile && env ARCHER_OPTIONS=ignore_serial=1 %libarcher-run-race")) + "%%libarcher-compile && env ARCHER_OPTIONS=\"ignore_serial=1 %s\" %%libarcher-run-race" \ + % config.environment['ARCHER_OPTIONS'])) config.substitutions.append(("%libarcher-compile-and-run-race", \ "%libarcher-compile && %libarcher-run-race")) config.substitutions.append(("%libarcher-compile-and-run-nosuppression", \