diff --git a/openmp/tools/archer/README.md b/openmp/tools/archer/README.md
--- a/openmp/tools/archer/README.md
+++ b/openmp/tools/archer/README.md
@@ -131,6 +131,15 @@
+
+  <tr>
+    <td>report_data_leak</td>
+    <td>0</td>
+    <td>Report leaking OMPT data for execution under
+    Archer. Used for testing and debugging Archer if errors occur.</td>
+  </tr>
+
+
  <tr>
    <td>verbose</td>
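As background for the new table row: like the existing flags, `report_data_leak` is read from the `ARCHER_OPTIONS` environment variable at tool startup. A minimal way to exercise it, sketched below, is to build any small OpenMP program with TSan instrumentation and run it with the flag set; the program and file name are illustrative only and not part of this patch or its test suite.

```cpp
// pool-smoke.cpp (hypothetical example, not part of the patch)
// Compile: clang++ -fopenmp -fsanitize=thread pool-smoke.cpp
// Run:     ARCHER_OPTIONS="report_data_leak=1" ./a.out
#include <cstdio>

int main() {
  int sum = 0;
#pragma omp parallel for reduction(+ : sum)
  for (int i = 0; i < 1000; i++)
    sum += i;
  // With report_data_leak=1, Archer checks at thread shutdown that every
  // pooled OMPT data object was returned to its DataPool and reports an
  // error otherwise.
  std::printf("sum = %d\n", sum);
  return 0;
}
```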
diff --git a/openmp/tools/archer/ompt-tsan.cpp b/openmp/tools/archer/ompt-tsan.cpp
--- a/openmp/tools/archer/ompt-tsan.cpp
+++ b/openmp/tools/archer/ompt-tsan.cpp
@@ -24,8 +24,9 @@
#include <list>
#include <mutex>
#include <sstream>
-#include <stack>
#include <string>
+#include <sys/resource.h>
+#include <unistd.h>
#include <unordered_map>
#include <vector>
@@ -34,7 +35,6 @@
#endif
#include "omp-tools.h"
-#include <sys/resource.h>
// Define attribute that indicates that the fall through from the previous
// case label is intentional and should not be diagnosed by a compiler
@@ -61,6 +61,7 @@
int print_max_rss{0};
int verbose{0};
int enabled{1};
+ int report_data_leak{0};
int ignore_serial{0};
ArcherFlags(const char *env) {
@@ -82,6 +83,8 @@
continue;
if (sscanf(it->c_str(), "verbose=%d", &verbose))
continue;
+ if (sscanf(it->c_str(), "report_data_leak=%d", &report_data_leak))
+ continue;
if (sscanf(it->c_str(), "enable=%d", &enabled))
continue;
if (sscanf(it->c_str(), "ignore_serial=%d", &ignore_serial))
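For readers without the surrounding context: the ArcherFlags constructor splits the `ARCHER_OPTIONS` string into whitespace-separated tokens and tries each known `key=%d` pattern with `sscanf`, so supporting the new flag only needs one more pattern. A simplified, standalone sketch of that parsing loop (hypothetical variable names, not the actual constructor):

```cpp
#include <cstdio>
#include <sstream>
#include <string>

int main() {
  // In Archer this string would come from getenv("ARCHER_OPTIONS").
  std::string env = "verbose=1 report_data_leak=1";
  int verbose = 0, report_data_leak = 0;

  std::istringstream iss(env);
  std::string token;
  while (iss >> token) {
    // Try each known key=value pattern; sscanf returns 0 when the literal
    // prefix does not match, so unmatched keys simply fall through.
    if (std::sscanf(token.c_str(), "verbose=%d", &verbose))
      continue;
    if (std::sscanf(token.c_str(), "report_data_leak=%d", &report_data_leak))
      continue;
    std::printf("unknown ARCHER_OPTIONS token: %s\n", token.c_str());
  }
  std::printf("verbose=%d report_data_leak=%d\n", verbose, report_data_leak);
}
```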
@@ -214,7 +217,7 @@
static ompt_get_parallel_info_t ompt_get_parallel_info;
static ompt_get_thread_data_t ompt_get_thread_data;
-typedef uint64_t ompt_tsan_clockid;
+typedef char ompt_tsan_clockid;
static uint64_t my_next_id() {
static uint64_t ID = 0;
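The typedef change above is what lets the pooled objects shrink: the clock fields are only ever used as addresses for the TSan happens-before annotations, never as counters, so one byte per sync variable is enough. A tiny illustration of the size effect (hypothetical struct names, not from the patch):

```cpp
#include <cstdint>

// Before: two 8-byte clocks per task; after: two 1-byte clocks per task.
struct TaskClocksBefore { std::uint64_t Task, Taskwait; };
struct TaskClocksAfter  { char Task, Taskwait; };

static_assert(sizeof(TaskClocksAfter) < sizeof(TaskClocksBefore),
              "smaller sync variables shrink every pooled TaskData object");

int main() { return 0; }
```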
@@ -222,100 +225,132 @@
return ret;
}
+static int pagesize{0};
+
// Data structure to provide a threadsafe pool of reusable objects.
-// DataPool
-template <typename T, int N> struct DataPool {
- std::mutex DPMutex;
- std::stack<T *> DataPointer;
+// DataPool
+template <typename T> struct DataPool final {
+ static __thread DataPool<T> *ThreadDataPool;
+ std::mutex DPMutex{};
+
+ // store unused objects
+ std::vector<T *> DataPointer{};
+ std::vector<T *> RemoteDataPointer{};
+
+ // store all allocated memory to finally release
std::list<void *> memory;
- int total;
+ // count remotely returned data (RemoteDataPointer.size())
+ std::atomic<int> remote{0};
+
+ // total number of data objects allocated in the pool
+ int total{0};
+#ifdef DEBUG_DATA
+ int remoteReturn{0};
+ int localReturn{0};
+
+ int getRemote() { return remoteReturn + remote; }
+ int getLocal() { return localReturn; }
+#endif
+ int getTotal() { return total; }
+ int getMissing() {
+ return total - DataPointer.size() - RemoteDataPointer.size();
+ }
+
+ // fill the pool by allocating a page of memory
void newDatas() {
- // prefix the Data with a pointer to 'this', allows to return memory to
- // 'this',
- // without explicitly knowing the source.
- //
- // To reduce lock contention, we use thread local DataPools, but Data
- // objects move to other threads.
- // The strategy is to get objects from local pool. Only if the object moved
- // to another
- // thread, we might see a penalty on release (returnData).
- // For "single producer" pattern, a single thread creates tasks, these are
- // executed by other threads.
- // The master will have a high demand on TaskData, so return after use.
- struct pooldata {
- DataPool<T, N> *dp;
- T data;
- };
- // We alloc without initialize the memory. We cannot call constructors.
- // Therefore use malloc!
- pooldata *datas = (pooldata *)malloc(sizeof(pooldata) * N);
+ if (remote > 0) {
+ const std::lock_guard<std::mutex> lock(DPMutex);
+ // DataPointer is empty, so just swap the vectors
+ DataPointer.swap(RemoteDataPointer);
+ remote = 0;
+ return;
+ }
+ // calculate size of an object including padding to cacheline size
+ size_t elemSize = sizeof(T);
+ size_t paddedSize = (((elemSize - 1) / 64) + 1) * 64;
+ // number of padded elements to allocate
+ int ndatas = pagesize / paddedSize;
+ char *datas = (char *)malloc(ndatas * paddedSize);
memory.push_back(datas);
- for (int i = 0; i < N; i++) {
- datas[i].dp = this;
- DataPointer.push(&(datas[i].data));
+ for (int i = 0; i < ndatas; i++) {
+ DataPointer.push_back(new (datas + i * paddedSize) T(this));
}
- total += N;
+ total += ndatas;
}
+ // get data from the pool
T *getData() {
T *ret;
- DPMutex.lock();
if (DataPointer.empty())
newDatas();
- ret = DataPointer.top();
- DataPointer.pop();
- DPMutex.unlock();
+ ret = DataPointer.back();
+ DataPointer.pop_back();
return ret;
}
- void returnData(T *data) {
- DPMutex.lock();
- DataPointer.push(data);
- DPMutex.unlock();
- }
-
- void getDatas(int n, T **datas) {
- DPMutex.lock();
- for (int i = 0; i < n; i++) {
- if (DataPointer.empty())
- newDatas();
- datas[i] = DataPointer.top();
- DataPointer.pop();
- }
- DPMutex.unlock();
+ // accesses to the thread-local datapool don't need locks
+ void returnOwnData(T *data) {
+ DataPointer.emplace_back(data);
+#ifdef DEBUG_DATA
+ localReturn++;
+#endif
}
- void returnDatas(int n, T **datas) {
- DPMutex.lock();
- for (int i = 0; i < n; i++) {
- DataPointer.push(datas[i]);
- }
- DPMutex.unlock();
+ // returning to a remote datapool using lock
+ void returnData(T *data) {
+ const std::lock_guard<std::mutex> lock(DPMutex);
+ RemoteDataPointer.emplace_back(data);
+ remote++;
+#ifdef DEBUG_DATA
+ remoteReturn++;
+#endif
}
- DataPool() : DPMutex(), DataPointer(), total(0) {}
-
~DataPool() {
// we assume all memory is returned when the thread finished / destructor is
// called
+ if (archer_flags->report_data_leak && getMissing() != 0) {
+ printf("ERROR: While freeing DataPool (%s) we are missing %i data "
+ "objects.\n",
+ __PRETTY_FUNCTION__, getMissing());
+ exit(-3);
+ }
+ for (auto i : DataPointer)
+ if (i)
+ i->~T();
+ for (auto i : RemoteDataPointer)
+ if (i)
+ i->~T();
for (auto i : memory)
if (i)
free(i);
}
};
-// This function takes care to return the data to the originating DataPool
-// A pointer to the originating DataPool is stored just before the actual data.
-template <typename T> static void retData(void *data) {
- ((DataPool<T, 4> **)data)[-1]->returnData((T *)data);
-}
+template <typename T> struct DataPoolEntry {
+ DataPool<T> *owner;
+
+ static T *New() { return DataPool<T>::ThreadDataPool->getData(); }
+
+ void Delete() {
+   static_cast<T *>(this)->Reset();
+   if (owner == DataPool<T>::ThreadDataPool)
+     owner->returnOwnData(static_cast<T *>(this));
+   else
+     owner->returnData(static_cast<T *>(this));
+ }
+
+ DataPoolEntry(DataPool<T> *dp) : owner(dp) {}
+};
struct ParallelData;
-__thread DataPool<ParallelData, 4> *pdp;
+typedef DataPool<ParallelData> ParallelDataPool;
+template <>
+__thread ParallelDataPool *ParallelDataPool::ThreadDataPool = nullptr;
/// Data structure to store additional information for parallel regions.
-struct ParallelData {
+struct ParallelData final : DataPoolEntry<ParallelData> {
// Parallel fork is just another barrier, use Barrier[1]
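Taking stock of the allocation scheme defined in the hunk above: every thread owns one DataPool per object type; getData() hands out cacheline-padded slots carved out of page-sized malloc blocks; a return by the owning thread is lock-free, while a return from another thread goes into RemoteDataPointer under the pool mutex and is swapped back in bulk the next time the owner runs out of local slots. The following self-contained sketch (simplified, one object type, no OMPT/TSan hooks, hypothetical names and sizes) mirrors that ownership pattern and the padding arithmetic, e.g. a 40-byte object is padded to 64 bytes so a 4096-byte page yields 64 slots:

```cpp
#include <atomic>
#include <cstddef>
#include <cstdlib>
#include <mutex>
#include <thread>
#include <vector>

// Simplified stand-in for the DataPool above (illustrative, not the patch).
struct Pool {
  std::mutex M;
  std::vector<void *> Local;        // owned by this thread, no lock needed
  std::vector<void *> Remote;       // returned by other threads, lock needed
  std::vector<void *> Blocks;       // raw allocations, freed in the destructor
  std::atomic<int> RemoteCount{0};  // lets refill() skip the lock if nothing came back

  static constexpr std::size_t PageSize = 4096; // stand-in for getpagesize()
  static constexpr std::size_t ElemSize = 40;   // stand-in for sizeof(T)
  // Round up to the cacheline size, like paddedSize in the patch:
  // ((40 - 1) / 64 + 1) * 64 == 64.
  static constexpr std::size_t PaddedSize = ((ElemSize - 1) / 64 + 1) * 64;

  void refill() {
    if (RemoteCount > 0) {
      // Cheap path: reclaim everything other threads returned in the meantime.
      std::lock_guard<std::mutex> G(M);
      Local.swap(Remote);
      RemoteCount = 0;
      return;
    }
    // Otherwise carve a fresh page into padded slots: 4096 / 64 == 64 of them.
    std::size_t N = PageSize / PaddedSize;
    char *Block = static_cast<char *>(std::malloc(N * PaddedSize));
    Blocks.push_back(Block);
    for (std::size_t I = 0; I < N; ++I)
      Local.push_back(Block + I * PaddedSize);
  }

  void *get() {
    if (Local.empty())
      refill();
    void *P = Local.back();
    Local.pop_back();
    return P;
  }

  void putOwn(void *P) { Local.push_back(P); }  // owning thread: lock-free
  void putRemote(void *P) {                     // any other thread: locked
    std::lock_guard<std::mutex> G(M);
    Remote.push_back(P);
    RemoteCount++;
  }

  ~Pool() {
    for (void *B : Blocks)
      std::free(B);
  }
};

int main() {
  Pool P;
  void *Slot = P.get();
  // Simulate an object that migrated to another thread and is returned remotely.
  std::thread([&] { P.putRemote(Slot); }).join();
  P.putOwn(P.get()); // subsequent gets are still served from the local vector
  return 0;
}
```

The point of the remote vector is that the common case, a thread creating and releasing its own objects, never touches the mutex; only objects that migrate across threads pay for locking, and even those are reclaimed in a single swap rather than one lock acquisition per object.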
@@ -328,14 +363,18 @@
void *GetBarrierPtr(unsigned Index) { return &(Barrier[Index]); }
- ParallelData(const void *codeptr) : codePtr(codeptr) {}
- ~ParallelData() {
- TsanDeleteClock(&(Barrier[0]));
- TsanDeleteClock(&(Barrier[1]));
+ ParallelData *Init(const void *codeptr) {
+ codePtr = codeptr;
+ return this;
}
- // overload new/delete to use DataPool for memory management.
- void *operator new(size_t size) { return pdp->getData(); }
- void operator delete(void *p, size_t) { retData(p); }
+
+ void Reset() {}
+
+ static ParallelData *New(const void *codeptr) {
+ return DataPoolEntry<ParallelData>::New()->Init(codeptr);
+ }
+
+ ParallelData(DataPool<ParallelData> *dp) : DataPoolEntry<ParallelData>(dp) {}
};
static inline ParallelData *ToParallelData(ompt_data_t *parallel_data) {
@@ -343,95 +382,83 @@
}
struct Taskgroup;
-__thread DataPool<Taskgroup, 4> *tgp;
+typedef DataPool<Taskgroup> TaskgroupPool;
+template <> __thread TaskgroupPool *TaskgroupPool::ThreadDataPool = nullptr;
/// Data structure to support stacking of taskgroups and allow synchronization.
-struct Taskgroup {
+struct Taskgroup final : DataPoolEntry<Taskgroup> {
/// Its address is used for relationships of the taskgroup's task set.
ompt_tsan_clockid Ptr;
/// Reference to the parent taskgroup.
Taskgroup *Parent;
- Taskgroup(Taskgroup *Parent) : Parent(Parent) {}
- ~Taskgroup() { TsanDeleteClock(&Ptr); }
-
void *GetPtr() { return &Ptr; }
- // overload new/delete to use DataPool for memory management.
- void *operator new(size_t size) { return tgp->getData(); }
- void operator delete(void *p, size_t) { retData(p); }
+
+ Taskgroup *Init(Taskgroup *parent) {
+ Parent = parent;
+ return this;
+ }
+
+ void Reset() {}
+
+ static Taskgroup *New(Taskgroup *Parent) {
+ return DataPoolEntry<Taskgroup>::New()->Init(Parent);
+ }
+
+ Taskgroup(DataPool<Taskgroup> *dp) : DataPoolEntry<Taskgroup>(dp) {}
};
struct TaskData;
-__thread DataPool<TaskData, 4> *tdp;
+typedef DataPool<TaskData> TaskDataPool;
+template <> __thread TaskDataPool *TaskDataPool::ThreadDataPool = nullptr;
/// Data structure to store additional information for tasks.
-struct TaskData {
+struct TaskData final : DataPoolEntry<TaskData> {
/// Its address is used for relationships of this task.
- ompt_tsan_clockid Task;
+ ompt_tsan_clockid Task{0};
/// Child tasks use its address to declare a relationship to a taskwait in
/// this task.
- ompt_tsan_clockid Taskwait;
+ ompt_tsan_clockid Taskwait{0};
/// Whether this task is currently executing a barrier.
- bool InBarrier;
+ bool InBarrier{false};
/// Whether this task is an included task.
int TaskType{0};
+ /// count execution phase
+ int execution{0};
+
/// Index of which barrier to use next.
- char BarrierIndex;
+ char BarrierIndex{0};
/// Count how often this structure has been put into child tasks + 1.
- std::atomic_int RefCount;
+ std::atomic_int RefCount{1};
/// Reference to the parent that created this task.
- TaskData *Parent;
+ TaskData *Parent{nullptr};
/// Reference to the implicit task in the stack above this task.
- TaskData *ImplicitTask;
+ TaskData *ImplicitTask{nullptr};
/// Reference to the team of this task.
- ParallelData *Team;
+ ParallelData *Team{nullptr};
/// Reference to the current taskgroup that this task either belongs to or
/// that it just created.
- Taskgroup *TaskGroup;
+ Taskgroup *TaskGroup{nullptr};
/// Dependency information for this task.
- ompt_dependence_t *Dependencies;
+ ompt_dependence_t *Dependencies{nullptr};
/// Number of dependency entries.
- unsigned DependencyCount;
-
- void *PrivateData;
- size_t PrivateDataSize;
-
- int execution;
- int freed;
+ unsigned DependencyCount{0};
- TaskData(TaskData *Parent, int taskType)
- : InBarrier(false), TaskType(taskType), BarrierIndex(0), RefCount(1),
- Parent(Parent), ImplicitTask(nullptr), Team(Parent->Team),
- TaskGroup(nullptr), DependencyCount(0), execution(0), freed(0) {
- if (Parent != nullptr) {
- Parent->RefCount++;
- // Copy over pointer to taskgroup. This task may set up its own stack
- // but for now belongs to its parent's taskgroup.
- TaskGroup = Parent->TaskGroup;
- }
- }
-
- TaskData(ParallelData *Team, int taskType)
- : InBarrier(false), TaskType(taskType), BarrierIndex(0), RefCount(1),
- Parent(nullptr), ImplicitTask(this), Team(Team), TaskGroup(nullptr),
- DependencyCount(0), execution(1), freed(0) {}
-
- ~TaskData() {
- TsanDeleteClock(&Task);
- TsanDeleteClock(&Taskwait);
- }
+#ifdef DEBUG
+ int freed{0};
+#endif
bool isIncluded() { return TaskType & ompt_task_undeferred; }
bool isUntied() { return TaskType & ompt_task_untied; }
@@ -447,9 +474,54 @@
void *GetTaskPtr() { return &Task; }
void *GetTaskwaitPtr() { return &Taskwait; }
- // overload new/delete to use DataPool for memory management.
- void *operator new(size_t size) { return tdp->getData(); }
- void operator delete(void *p, size_t) { retData(p); }
+
+ TaskData *Init(TaskData *parent, int taskType) {
+ TaskType = taskType;
+ Parent = parent;
+ Team = Parent->Team;
+ if (Parent != nullptr) {
+ Parent->RefCount++;
+ // Copy over pointer to taskgroup. This task may set up its own stack
+ // but for now belongs to its parent's taskgroup.
+ TaskGroup = Parent->TaskGroup;
+ }
+ return this;
+ }
+
+ TaskData *Init(ParallelData *team, int taskType) {
+ TaskType = taskType;
+ execution = 1;
+ ImplicitTask = this;
+ Team = team;
+ return this;
+ }
+
+ void Reset() {
+ InBarrier = false;
+ TaskType = 0;
+ execution = 0;
+ BarrierIndex = 0;
+ RefCount = 1;
+ Parent = nullptr;
+ ImplicitTask = nullptr;
+ Team = nullptr;
+ TaskGroup = nullptr;
+ Dependencies = nullptr;
+ DependencyCount = 0;
+#ifdef DEBUG
+ freed = 0;
+#endif
+ }
+
+ static TaskData *New(TaskData *parent, int taskType) {
+ return DataPoolEntry<TaskData>::New()->Init(parent, taskType);
+ }
+
+ static TaskData *New(ParallelData *team, int taskType) {
+ return DataPoolEntry<TaskData>::New()->Init(team, taskType);
+ }
+
+ TaskData(DataPool<TaskData> *dp) : DataPoolEntry<TaskData>(dp) {}
};
static inline TaskData *ToTaskData(ompt_data_t *task_data) {
@@ -470,19 +542,22 @@
static void ompt_tsan_thread_begin(ompt_thread_t thread_type,
ompt_data_t *thread_data) {
- pdp = new DataPool<ParallelData, 4>;
- TsanNewMemory(pdp, sizeof(pdp));
- tgp = new DataPool<Taskgroup, 4>;
- TsanNewMemory(tgp, sizeof(tgp));
- tdp = new DataPool<TaskData, 4>;
- TsanNewMemory(tdp, sizeof(tdp));
+ ParallelDataPool::ThreadDataPool = new ParallelDataPool;
+ TsanNewMemory(ParallelDataPool::ThreadDataPool,
+ sizeof(ParallelDataPool::ThreadDataPool));
+ TaskgroupPool::ThreadDataPool = new TaskgroupPool;
+ TsanNewMemory(TaskgroupPool::ThreadDataPool,
+ sizeof(TaskgroupPool::ThreadDataPool));
+ TaskDataPool::ThreadDataPool = new TaskDataPool;
+ TsanNewMemory(TaskDataPool::ThreadDataPool,
+ sizeof(TaskDataPool::ThreadDataPool));
thread_data->value = my_next_id();
}
static void ompt_tsan_thread_end(ompt_data_t *thread_data) {
- delete pdp;
- delete tgp;
- delete tdp;
+ delete ParallelDataPool::ThreadDataPool;
+ delete TaskgroupPool::ThreadDataPool;
+ delete TaskDataPool::ThreadDataPool;
}
/// OMPT event callbacks for handling parallel regions.
@@ -492,7 +567,7 @@
ompt_data_t *parallel_data,
uint32_t requested_team_size, int flag,
const void *codeptr_ra) {
- ParallelData *Data = new ParallelData(codeptr_ra);
+ ParallelData *Data = ParallelData::New(codeptr_ra);
parallel_data->ptr = Data;
TsanHappensBefore(Data->GetParallelPtr());
@@ -509,7 +584,7 @@
TsanHappensAfter(Data->GetBarrierPtr(0));
TsanHappensAfter(Data->GetBarrierPtr(1));
- delete Data;
+ Data->Delete();
#if (LLVM_VERSION >= 40)
if (&__archer_get_omp_status) {
@@ -527,19 +602,24 @@
switch (endpoint) {
case ompt_scope_begin:
if (type & ompt_task_initial) {
- parallel_data->ptr = new ParallelData(nullptr);
+ parallel_data->ptr = ParallelData::New(nullptr);
}
- task_data->ptr = new TaskData(ToParallelData(parallel_data), type);
+ task_data->ptr = TaskData::New(ToParallelData(parallel_data), type);
TsanHappensAfter(ToParallelData(parallel_data)->GetParallelPtr());
TsanFuncEntry(ToParallelData(parallel_data)->codePtr);
break;
case ompt_scope_end: {
TaskData *Data = ToTaskData(task_data);
+#ifdef DEBUG
assert(Data->freed == 0 && "Implicit task end should only be called once!");
Data->freed = 1;
+#endif
assert(Data->RefCount == 1 &&
"All tasks should have finished at the implicit barrier!");
- delete Data;
+ Data->Delete();
+ if (type & ompt_task_initial) {
+ ToParallelData(parallel_data)->Delete();
+ }
TsanFuncExit();
break;
}
@@ -588,7 +668,7 @@
break;
case ompt_sync_region_taskgroup:
- Data->TaskGroup = new Taskgroup(Data->TaskGroup);
+ Data->TaskGroup = Taskgroup::New(Data->TaskGroup);
break;
case ompt_sync_region_reduction:
@@ -643,7 +723,7 @@
// Delete this allocated taskgroup, all descendent task are finished by
// now.
Taskgroup *Parent = Data->TaskGroup->Parent;
- delete Data->TaskGroup;
+ Data->TaskGroup->Delete();
Data->TaskGroup = Parent;
break;
}
@@ -705,16 +785,16 @@
ompt_data_t *parallel_data;
int team_size = 1;
ompt_get_parallel_info(0, &parallel_data, &team_size);
- ParallelData *PData = new ParallelData(nullptr);
+ ParallelData *PData = ParallelData::New(nullptr);
parallel_data->ptr = PData;
- Data = new TaskData(PData, type);
+ Data = TaskData::New(PData, type);
new_task_data->ptr = Data;
} else if (type & ompt_task_undeferred) {
- Data = new TaskData(ToTaskData(parent_task_data), type);
+ Data = TaskData::New(ToTaskData(parent_task_data), type);
new_task_data->ptr = Data;
} else if (type & ompt_task_explicit || type & ompt_task_target) {
- Data = new TaskData(ToTaskData(parent_task_data), type);
+ Data = TaskData::New(ToTaskData(parent_task_data), type);
new_task_data->ptr = Data;
// Use the newly created address. We cannot use a single address from the
@@ -731,7 +811,7 @@
if (task->DependencyCount > 0) {
delete[] task->Dependencies;
}
- delete task;
+ task->Delete();
task = Parent;
}
}
@@ -1006,6 +1086,8 @@
return NULL;
}
+ pagesize = getpagesize();
+
static ompt_start_tool_result_t ompt_start_tool_result = {
&ompt_tsan_initialize, &ompt_tsan_finalize, {0}};
diff --git a/openmp/tools/archer/tests/lit.cfg b/openmp/tools/archer/tests/lit.cfg
--- a/openmp/tools/archer/tests/lit.cfg
+++ b/openmp/tools/archer/tests/lit.cfg
@@ -90,9 +90,12 @@
if 'INTEL_LICENSE_FILE' in os.environ:
config.environment['INTEL_LICENSE_FILE'] = os.environ['INTEL_LICENSE_FILE']
+config.environment['ARCHER_OPTIONS'] = "report_data_leak=1"
+
# Race Tests
config.substitutions.append(("%libarcher-compile-and-run-race-noserial", \
- "%libarcher-compile && env ARCHER_OPTIONS=ignore_serial=1 %libarcher-run-race"))
+ "%%libarcher-compile && env ARCHER_OPTIONS=\"ignore_serial=1 %s\" %%libarcher-run-race" \
+ % config.environment['ARCHER_OPTIONS']))
config.substitutions.append(("%libarcher-compile-and-run-race", \
"%libarcher-compile && %libarcher-run-race"))
config.substitutions.append(("%libarcher-compile-and-run-nosuppression", \