Diffstat (limited to 'openmp/tools/archer/ompt-tsan.cpp')
-rw-r--r--  openmp/tools/archer/ompt-tsan.cpp | 904
1 file changed, 904 insertions, 0 deletions
diff --git a/openmp/tools/archer/ompt-tsan.cpp b/openmp/tools/archer/ompt-tsan.cpp
new file mode 100644
index 00000000000..552edf3af32
--- /dev/null
+++ b/openmp/tools/archer/ompt-tsan.cpp
@@ -0,0 +1,904 @@
+/*
+ * ompt-tsan.cpp -- Archer runtime library, TSan annotations for Archer
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for details.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <atomic>
+#include <cassert>
+#include <cstdlib>
+#include <cstring>
+#include <inttypes.h>
+#include <iostream>
+#include <list>
+#include <mutex>
+#include <sstream>
+#include <stack>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#if (defined __APPLE__ && defined __MACH__)
+#include <dlfcn.h>
+#endif
+
+#include <sys/resource.h>
+#include "omp-tools.h"
+
+static int runOnTsan;
+static int hasReductionCallback;
+
+class ArcherFlags {
+public:
+#if (LLVM_VERSION) >= 40
+  int flush_shadow;
+#endif
+  int print_max_rss;
+  int verbose;
+  int enabled;
+
+  ArcherFlags(const char *env)
+      :
+#if (LLVM_VERSION) >= 40
+        flush_shadow(0),
+#endif
+        print_max_rss(0), verbose(0), enabled(1) {
+    if (env) {
+      std::vector<std::string> tokens;
+      std::string token;
+      std::string str(env);
+      std::istringstream iss(str);
+      while (std::getline(iss, token, ' '))
+        tokens.push_back(token);
+
+      for (std::vector<std::string>::iterator it = tokens.begin();
+           it != tokens.end(); ++it) {
+#if (LLVM_VERSION) >= 40
+        if (sscanf(it->c_str(), "flush_shadow=%d", &flush_shadow))
+          continue;
+#endif
+        if (sscanf(it->c_str(), "print_max_rss=%d", &print_max_rss))
+          continue;
+        if (sscanf(it->c_str(), "verbose=%d", &verbose))
+          continue;
+        if (sscanf(it->c_str(), "enable=%d", &enabled))
+          continue;
+        std::cerr << "Illegal value for ARCHER_OPTIONS variable: " << token
+                  << std::endl;
+      }
+    }
+  }
+};
+
+#if (LLVM_VERSION) >= 40
+extern "C" {
+int __attribute__((weak)) __archer_get_omp_status();
+void __attribute__((weak)) __tsan_flush_memory() {}
+}
+#endif
+ArcherFlags *archer_flags;
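For reference, ARCHER_OPTIONS is parsed as space-separated key=value tokens. A minimal sketch of how the flags above are consumed, assuming the ArcherFlags class from this file (the option string and the use of POSIX setenv are illustrative, not part of the commit):

    #include <cassert>
    #include <cstdlib>

    int main() {
      // Hypothetical option string; keys match the sscanf patterns above.
      setenv("ARCHER_OPTIONS", "verbose=1 print_max_rss=1", 1);
      ArcherFlags flags(getenv("ARCHER_OPTIONS"));
      assert(flags.verbose == 1 && flags.print_max_rss == 1);
      return 0;
    }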
+extern "C" { +#if (defined __APPLE__ && defined __MACH__) +static void AnnotateHappensAfter(const char *file, int line, + const volatile void *cv) { + void (*fptr)(const char *, int, const volatile void *); + + fptr = (void (*)(const char *, int, const volatile void *))dlsym( + RTLD_DEFAULT, "AnnotateHappensAfter"); + (*fptr)(file, line, cv); +} +static void AnnotateHappensBefore(const char *file, int line, + const volatile void *cv) { + void (*fptr)(const char *, int, const volatile void *); + + fptr = (void (*)(const char *, int, const volatile void *))dlsym( + RTLD_DEFAULT, "AnnotateHappensBefore"); + (*fptr)(file, line, cv); +} +static void AnnotateIgnoreWritesBegin(const char *file, int line) { + void (*fptr)(const char *, int); + + fptr = (void (*)(const char *, int))dlsym(RTLD_DEFAULT, + "AnnotateIgnoreWritesBegin"); + (*fptr)(file, line); +} +static void AnnotateIgnoreWritesEnd(const char *file, int line) { + void (*fptr)(const char *, int); + + fptr = (void (*)(const char *, int))dlsym(RTLD_DEFAULT, + "AnnotateIgnoreWritesEnd"); + (*fptr)(file, line); +} +static void AnnotateNewMemory(const char *file, int line, + const volatile void *cv, size_t size) { + void (*fptr)(const char *, int, const volatile void *, size_t); + + fptr = (void (*)(const char *, int, const volatile void *, size_t))dlsym( + RTLD_DEFAULT, "AnnotateNewMemory"); + (*fptr)(file, line, cv, size); +} +static int RunningOnValgrind() { + int (*fptr)(); + + fptr = (int (*)())dlsym(RTLD_DEFAULT, "RunningOnValgrind"); + if (fptr && fptr != RunningOnValgrind) + runOnTsan = 0; + return 0; +} +#else +void __attribute__((weak)) +AnnotateHappensAfter(const char *file, int line, const volatile void *cv) {} +void __attribute__((weak)) +AnnotateHappensBefore(const char *file, int line, const volatile void *cv) {} +void __attribute__((weak)) +AnnotateIgnoreWritesBegin(const char *file, int line) {} +void __attribute__((weak)) AnnotateIgnoreWritesEnd(const char *file, int line) { +} +void __attribute__((weak)) +AnnotateNewMemory(const char *file, int line, const volatile void *cv, + size_t size) {} +int __attribute__((weak)) RunningOnValgrind() { + runOnTsan = 0; + return 0; +} +#endif +} + +// This marker is used to define a happens-before arc. The race detector will +// infer an arc from the begin to the end when they share the same pointer +// argument. +#define TsanHappensBefore(cv) AnnotateHappensBefore(__FILE__, __LINE__, cv) + +// This marker defines the destination of a happens-before arc. +#define TsanHappensAfter(cv) AnnotateHappensAfter(__FILE__, __LINE__, cv) + +// Ignore any races on writes between here and the next TsanIgnoreWritesEnd. +#define TsanIgnoreWritesBegin() AnnotateIgnoreWritesBegin(__FILE__, __LINE__) + +// Resume checking for racy writes. +#define TsanIgnoreWritesEnd() AnnotateIgnoreWritesEnd(__FILE__, __LINE__) + +// We don't really delete the clock for now +#define TsanDeleteClock(cv) + +// newMemory +#define TsanNewMemory(addr, size) \ + AnnotateNewMemory(__FILE__, __LINE__, addr, size) +#define TsanFreeMemory(addr, size) \ + AnnotateNewMemory(__FILE__, __LINE__, addr, size) +#endif + +/// Required OMPT inquiry functions. +static ompt_get_parallel_info_t ompt_get_parallel_info; +static ompt_get_thread_data_t ompt_get_thread_data; + +typedef uint64_t ompt_tsan_clockid; + +static uint64_t my_next_id() { + static uint64_t ID = 0; + uint64_t ret = __sync_fetch_and_add(&ID, 1); + return ret; +} + +// Data structure to provide a threadsafe pool of reusable objects. 
+
+/// Required OMPT inquiry functions.
+static ompt_get_parallel_info_t ompt_get_parallel_info;
+static ompt_get_thread_data_t ompt_get_thread_data;
+
+typedef uint64_t ompt_tsan_clockid;
+
+static uint64_t my_next_id() {
+  static uint64_t ID = 0;
+  uint64_t ret = __sync_fetch_and_add(&ID, 1);
+  return ret;
+}
+
+// Data structure to provide a threadsafe pool of reusable objects.
+// DataPool<Type of objects, Size of block allocation>
+template <typename T, int N> struct DataPool {
+  std::mutex DPMutex;
+  std::stack<T *> DataPointer;
+  std::list<void *> memory;
+  int total;
+
+  void newDatas() {
+    // Prefix the data with a pointer to 'this'; this allows returning the
+    // memory to this pool without explicitly knowing the source.
+    //
+    // To reduce lock contention, we use thread-local DataPools, but data
+    // objects may move to other threads. The strategy is to get objects from
+    // the local pool; only if an object has moved to another thread do we pay
+    // a penalty on release (returnData). In the "single producer" pattern, a
+    // single thread creates tasks that are executed by other threads; the
+    // master thread has a high demand for TaskData, so objects are returned
+    // to its pool after use.
+    struct pooldata {
+      DataPool<T, N> *dp;
+      T data;
+    };
+    // We allocate the memory without initializing it because we must not
+    // call constructors here; therefore use malloc.
+    pooldata *datas = (pooldata *)malloc(sizeof(pooldata) * N);
+    memory.push_back(datas);
+    for (int i = 0; i < N; i++) {
+      datas[i].dp = this;
+      DataPointer.push(&(datas[i].data));
+    }
+    total += N;
+  }
+
+  T *getData() {
+    T *ret;
+    DPMutex.lock();
+    if (DataPointer.empty())
+      newDatas();
+    ret = DataPointer.top();
+    DataPointer.pop();
+    DPMutex.unlock();
+    return ret;
+  }
+
+  void returnData(T *data) {
+    DPMutex.lock();
+    DataPointer.push(data);
+    DPMutex.unlock();
+  }
+
+  void getDatas(int n, T **datas) {
+    DPMutex.lock();
+    for (int i = 0; i < n; i++) {
+      if (DataPointer.empty())
+        newDatas();
+      datas[i] = DataPointer.top();
+      DataPointer.pop();
+    }
+    DPMutex.unlock();
+  }
+
+  void returnDatas(int n, T **datas) {
+    DPMutex.lock();
+    for (int i = 0; i < n; i++) {
+      DataPointer.push(datas[i]);
+    }
+    DPMutex.unlock();
+  }
+
+  DataPool() : DPMutex(), DataPointer(), total(0) {}
+
+  ~DataPool() {
+    // We assume all memory has been returned by the time the thread finishes
+    // and the destructor is called.
+    for (auto i : memory)
+      if (i)
+        free(i);
+  }
+};
+
+// This function returns the data to the originating DataPool.
+// A pointer to the originating DataPool is stored just before the actual data.
+template <typename T, int N> static void retData(void *data) {
+  ((DataPool<T, N> **)data)[-1]->returnData((T *)data);
+}
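The pointer prefix is what lets retData find the owner without any lookup: each handed-out object sits directly behind a pointer to its owning pool. A small illustrative sketch (the element type is invented; it assumes, as the code above does, that no padding separates the two pooldata members):

    DataPool<int, 4> pool;     // thread-local in the real tool
    int *obj = pool.getData(); // &pooldata::data of some slot
    // One pointer-width before the object lies the owning pool:
    //   ((DataPool<int, 4> **)obj)[-1] == &pool
    retData<int, 4>(obj);      // returns obj to 'pool'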
+
+struct ParallelData;
+__thread DataPool<ParallelData, 4> *pdp;
+
+/// Data structure to store additional information for parallel regions.
+struct ParallelData {
+
+  // Parallel fork is just another barrier: use Barrier[1].
+
+  /// Two addresses for relationships with barriers.
+  ompt_tsan_clockid Barrier[2];
+
+  void *GetParallelPtr() { return &(Barrier[1]); }
+
+  void *GetBarrierPtr(unsigned Index) { return &(Barrier[Index]); }
+
+  ~ParallelData() {
+    TsanDeleteClock(&(Barrier[0]));
+    TsanDeleteClock(&(Barrier[1]));
+  }
+  // Overload new/delete to use DataPool for memory management.
+  void *operator new(size_t size) { return pdp->getData(); }
+  void operator delete(void *p, size_t) { retData<ParallelData, 4>(p); }
+};
+
+static inline ParallelData *ToParallelData(ompt_data_t *parallel_data) {
+  return reinterpret_cast<ParallelData *>(parallel_data->ptr);
+}
+
+struct Taskgroup;
+__thread DataPool<Taskgroup, 4> *tgp;
+
+/// Data structure to support stacking of taskgroups and allow synchronization.
+struct Taskgroup {
+  /// Its address is used for relationships of the taskgroup's task set.
+  ompt_tsan_clockid Ptr;
+
+  /// Reference to the parent taskgroup.
+  Taskgroup *Parent;
+
+  Taskgroup(Taskgroup *Parent) : Parent(Parent) {}
+  ~Taskgroup() { TsanDeleteClock(&Ptr); }
+
+  void *GetPtr() { return &Ptr; }
+  // Overload new/delete to use DataPool for memory management.
+  void *operator new(size_t size) { return tgp->getData(); }
+  void operator delete(void *p, size_t) { retData<Taskgroup, 4>(p); }
+};
+
+struct TaskData;
+__thread DataPool<TaskData, 4> *tdp;
+
+/// Data structure to store additional information for tasks.
+struct TaskData {
+  /// Its address is used for relationships of this task.
+  ompt_tsan_clockid Task;
+
+  /// Child tasks use its address to declare a relationship to a taskwait in
+  /// this task.
+  ompt_tsan_clockid Taskwait;
+
+  /// Whether this task is currently executing a barrier.
+  bool InBarrier;
+
+  /// Whether this task is an included task.
+  bool Included;
+
+  /// Index of which barrier to use next.
+  char BarrierIndex;
+
+  /// Count how often this structure has been put into child tasks + 1.
+  std::atomic_int RefCount;
+
+  /// Reference to the parent that created this task.
+  TaskData *Parent;
+
+  /// Reference to the implicit task in the stack above this task.
+  TaskData *ImplicitTask;
+
+  /// Reference to the team of this task.
+  ParallelData *Team;
+
+  /// Reference to the current taskgroup that this task either belongs to or
+  /// that it just created.
+  Taskgroup *TaskGroup;
+
+  /// Dependency information for this task.
+  ompt_dependence_t *Dependencies;
+
+  /// Number of dependency entries.
+  unsigned DependencyCount;
+
+  void *PrivateData;
+  size_t PrivateDataSize;
+
+  int execution;
+  int freed;
+
+  TaskData(TaskData *Parent)
+      : InBarrier(false), Included(false), BarrierIndex(0), RefCount(1),
+        Parent(Parent), ImplicitTask(nullptr), Team(Parent->Team),
+        TaskGroup(nullptr), DependencyCount(0), execution(0), freed(0) {
+    if (Parent != nullptr) {
+      Parent->RefCount++;
+      // Copy over the pointer to the taskgroup. This task may set up its own
+      // stack but for now belongs to its parent's taskgroup.
+      TaskGroup = Parent->TaskGroup;
+    }
+  }
+
+  TaskData(ParallelData *Team = nullptr)
+      : InBarrier(false), Included(false), BarrierIndex(0), RefCount(1),
+        Parent(nullptr), ImplicitTask(this), Team(Team), TaskGroup(nullptr),
+        DependencyCount(0), execution(1), freed(0) {}
+
+  ~TaskData() {
+    TsanDeleteClock(&Task);
+    TsanDeleteClock(&Taskwait);
+  }
+
+  void *GetTaskPtr() { return &Task; }
+
+  void *GetTaskwaitPtr() { return &Taskwait; }
+  // Overload new/delete to use DataPool for memory management.
+  void *operator new(size_t size) { return tdp->getData(); }
+  void operator delete(void *p, size_t) { retData<TaskData, 4>(p); }
+};
+
+static inline TaskData *ToTaskData(ompt_data_t *task_data) {
+  return reinterpret_cast<TaskData *>(task_data->ptr);
+}
+
+static inline void *ToInAddr(void *OutAddr) {
+  // FIXME: This will give false negatives when a second variable lies
+  // directly behind a variable that only has a width of 1 byte.
+  // Another approach would be to "negate" the address or to flip the
+  // first bit...
+  return reinterpret_cast<char *>(OutAddr) + 1;
+}
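Together with the dependency handling in ompt_tsan_task_schedule below, this yields two synchronization addresses per depend variable v: &v orders a task against prior writers, while ToInAddr(&v) orders it against prior readers. A comment-style summary of the arcs the tool emits (see task_schedule for the actual code):

    // For each dependence on variable v, as emitted in task_schedule:
    //   task start,    any type:  TsanHappensAfter(&v)            wait for prior writers
    //   task start,    out/inout: TsanHappensAfter(ToInAddr(&v))  also wait for prior readers
    //   task complete, any type:  TsanHappensBefore(ToInAddr(&v)) publish as reader
    //   task complete, out/inout: TsanHappensBefore(&v)           also publish as writer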
+
+/// Store a mutex for each wait_id to resolve race conditions with callbacks.
+std::unordered_map<ompt_wait_id_t, std::mutex> Locks;
+std::mutex LocksMutex;
+
+static void ompt_tsan_thread_begin(ompt_thread_t thread_type,
+                                   ompt_data_t *thread_data) {
+  pdp = new DataPool<ParallelData, 4>;
+  TsanNewMemory(pdp, sizeof(pdp));
+  tgp = new DataPool<Taskgroup, 4>;
+  TsanNewMemory(tgp, sizeof(tgp));
+  tdp = new DataPool<TaskData, 4>;
+  TsanNewMemory(tdp, sizeof(tdp));
+  thread_data->value = my_next_id();
+}
+
+static void ompt_tsan_thread_end(ompt_data_t *thread_data) {
+  delete pdp;
+  delete tgp;
+  delete tdp;
+}
+
+/// OMPT event callbacks for handling parallel regions.
+
+static void ompt_tsan_parallel_begin(ompt_data_t *parent_task_data,
+                                     const ompt_frame_t *parent_task_frame,
+                                     ompt_data_t *parallel_data,
+                                     uint32_t requested_team_size, int flag,
+                                     const void *codeptr_ra) {
+  ParallelData *Data = new ParallelData;
+  parallel_data->ptr = Data;
+
+  TsanHappensBefore(Data->GetParallelPtr());
+}
+
+static void ompt_tsan_parallel_end(ompt_data_t *parallel_data,
+                                   ompt_data_t *task_data, int flag,
+                                   const void *codeptr_ra) {
+  ParallelData *Data = ToParallelData(parallel_data);
+  TsanHappensAfter(Data->GetBarrierPtr(0));
+  TsanHappensAfter(Data->GetBarrierPtr(1));
+
+  delete Data;
+
+#if (LLVM_VERSION >= 40)
+  if (&__archer_get_omp_status) {
+    if (__archer_get_omp_status() == 0 && archer_flags->flush_shadow)
+      __tsan_flush_memory();
+  }
+#endif
+}
+
+static void ompt_tsan_implicit_task(ompt_scope_endpoint_t endpoint,
+                                    ompt_data_t *parallel_data,
+                                    ompt_data_t *task_data,
+                                    unsigned int team_size,
+                                    unsigned int thread_num, int type) {
+  switch (endpoint) {
+  case ompt_scope_begin:
+    task_data->ptr = new TaskData(ToParallelData(parallel_data));
+    TsanHappensAfter(ToParallelData(parallel_data)->GetParallelPtr());
+    break;
+  case ompt_scope_end: {
+    TaskData *Data = ToTaskData(task_data);
+    assert(Data->freed == 0 &&
+           "Implicit task end should only be called once!");
+    Data->freed = 1;
+    assert(Data->RefCount == 1 &&
+           "All tasks should have finished at the implicit barrier!");
+    delete Data;
+    break;
+  }
+  }
+}
+
+static void ompt_tsan_sync_region(ompt_sync_region_t kind,
+                                  ompt_scope_endpoint_t endpoint,
+                                  ompt_data_t *parallel_data,
+                                  ompt_data_t *task_data,
+                                  const void *codeptr_ra) {
+  TaskData *Data = ToTaskData(task_data);
+  switch (endpoint) {
+  case ompt_scope_begin:
+    switch (kind) {
+    case ompt_sync_region_barrier_implementation:
+    case ompt_sync_region_barrier_implicit:
+    case ompt_sync_region_barrier_explicit:
+    case ompt_sync_region_barrier: {
+      char BarrierIndex = Data->BarrierIndex;
+      TsanHappensBefore(Data->Team->GetBarrierPtr(BarrierIndex));
+
+      if (hasReductionCallback < ompt_set_always) {
+        // We ignore writes inside the barrier. These would occur during
+        // 1. reductions performed by the runtime, which are guaranteed to be
+        //    race-free, or
+        // 2. execution of another task.
+        // For the latter case we will re-enable tracking in task_switch.
+        Data->InBarrier = true;
+        TsanIgnoreWritesBegin();
+      }
+
+      break;
+    }
+
+    case ompt_sync_region_taskwait:
+      break;
+
+    case ompt_sync_region_taskgroup:
+      Data->TaskGroup = new Taskgroup(Data->TaskGroup);
+      break;
+
+    default:
+      break;
+    }
+    break;
+  case ompt_scope_end:
+    switch (kind) {
+    case ompt_sync_region_barrier_implementation:
+    case ompt_sync_region_barrier_implicit:
+    case ompt_sync_region_barrier_explicit:
+    case ompt_sync_region_barrier: {
+      if (hasReductionCallback < ompt_set_always) {
+        // We want to track writes after the barrier again.
+        Data->InBarrier = false;
+        TsanIgnoreWritesEnd();
+      }
+
+      char BarrierIndex = Data->BarrierIndex;
+      // The barrier will end after it has been entered by all threads.
+      if (parallel_data)
+        TsanHappensAfter(Data->Team->GetBarrierPtr(BarrierIndex));
+
+      // It is not guaranteed that all threads have exited this barrier before
+      // we enter the next one, so we use a different address for the next
+      // barrier. We are, however, guaranteed that this barrier is finished
+      // by the time we exit the next one, so we can then reuse the first
+      // address.
+      Data->BarrierIndex = (BarrierIndex + 1) % 2;
+      break;
+    }
+
+    case ompt_sync_region_taskwait: {
+      if (Data->execution > 1)
+        TsanHappensAfter(Data->GetTaskwaitPtr());
+      break;
+    }
+
+    case ompt_sync_region_taskgroup: {
+      assert(Data->TaskGroup != nullptr &&
+             "Should have at least one taskgroup!");
+
+      TsanHappensAfter(Data->TaskGroup->GetPtr());
+
+      // Delete this allocated taskgroup; all descendant tasks are finished by
+      // now.
+      Taskgroup *Parent = Data->TaskGroup->Parent;
+      delete Data->TaskGroup;
+      Data->TaskGroup = Parent;
+      break;
+    }
+
+    default:
+      break;
+    }
+    break;
+  }
+}
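The two-element Barrier array thus acts as a double buffer: consecutive barriers of a team alternate between the two clock addresses, which is safe because at most two barriers can overlap in the way described above. A sketch of the alternation for a team that hits three barriers (names as defined above):

    // Per-task BarrierIndex, per-team Barrier[2]:
    //   barrier #1: HB/HA on Team->Barrier[0], then BarrierIndex = 1
    //   barrier #2: HB/HA on Team->Barrier[1], then BarrierIndex = 0
    //   barrier #3: Team->Barrier[0] again -- barrier #1 is guaranteed
    //               finished by now, so reusing its address is sound.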
+
+static void ompt_tsan_reduction(ompt_sync_region_t kind,
+                                ompt_scope_endpoint_t endpoint,
+                                ompt_data_t *parallel_data,
+                                ompt_data_t *task_data,
+                                const void *codeptr_ra) {
+  switch (endpoint) {
+  case ompt_scope_begin:
+    switch (kind) {
+    case ompt_sync_region_reduction:
+      TsanIgnoreWritesBegin();
+      break;
+    default:
+      break;
+    }
+    break;
+  case ompt_scope_end:
+    switch (kind) {
+    case ompt_sync_region_reduction:
+      TsanIgnoreWritesEnd();
+      break;
+    default:
+      break;
+    }
+    break;
+  }
+}
+
+/// OMPT event callbacks for handling tasks.
+
+static void ompt_tsan_task_create(
+    ompt_data_t *parent_task_data,    /* id of parent task */
+    const ompt_frame_t *parent_frame, /* frame data for parent task */
+    ompt_data_t *new_task_data,       /* id of created task */
+    int type, int has_dependences,
+    const void *codeptr_ra) /* pointer to outlined function */
+{
+  TaskData *Data;
+  assert(new_task_data->ptr == NULL &&
+         "Task data should be initialized to NULL");
+  if (type & ompt_task_initial) {
+    ompt_data_t *parallel_data;
+    int team_size = 1;
+    ompt_get_parallel_info(0, &parallel_data, &team_size);
+    ParallelData *PData = new ParallelData;
+    parallel_data->ptr = PData;
+
+    Data = new TaskData(PData);
+    new_task_data->ptr = Data;
+  } else if (type & ompt_task_undeferred) {
+    Data = new TaskData(ToTaskData(parent_task_data));
+    new_task_data->ptr = Data;
+    Data->Included = true;
+  } else if (type & ompt_task_explicit || type & ompt_task_target) {
+    Data = new TaskData(ToTaskData(parent_task_data));
+    new_task_data->ptr = Data;
+
+    // Use the newly created address. We cannot use a single address from the
+    // parent because that would declare wrong relationships with other
+    // sibling tasks that may be created before this task is started!
+    TsanHappensBefore(Data->GetTaskPtr());
+    ToTaskData(parent_task_data)->execution++;
+  }
+}
+
+static void ompt_tsan_task_schedule(ompt_data_t *first_task_data,
+                                    ompt_task_status_t prior_task_status,
+                                    ompt_data_t *second_task_data) {
+  TaskData *FromTask = ToTaskData(first_task_data);
+  TaskData *ToTask = ToTaskData(second_task_data);
+
+  if (ToTask->Included && prior_task_status != ompt_task_complete)
+    return; // No further synchronization when beginning an included task.
+  if (FromTask->Included && prior_task_status == ompt_task_complete) {
+    // Just delete the task:
+    while (FromTask != nullptr && --FromTask->RefCount == 0) {
+      TaskData *Parent = FromTask->Parent;
+      if (FromTask->DependencyCount > 0) {
+        delete[] FromTask->Dependencies;
+      }
+      delete FromTask;
+      FromTask = Parent;
+    }
+    return;
+  }
+
+  if (ToTask->execution == 0) {
+    ToTask->execution++;
+    // 1. The task begins execution after it has been created.
+    TsanHappensAfter(ToTask->GetTaskPtr());
+    for (unsigned i = 0; i < ToTask->DependencyCount; i++) {
+      ompt_dependence_t *Dependency = &ToTask->Dependencies[i];
+
+      TsanHappensAfter(Dependency->variable.ptr);
+      // in and inout dependencies are also blocked by prior in dependencies!
+      if (Dependency->dependence_type == ompt_dependence_type_out ||
+          Dependency->dependence_type == ompt_dependence_type_inout) {
+        TsanHappensAfter(ToInAddr(Dependency->variable.ptr));
+      }
+    }
+  } else {
+    // 2. The task resumes after it has been switched away.
+    TsanHappensAfter(ToTask->GetTaskPtr());
+  }
+
+  if (prior_task_status != ompt_task_complete) {
+    ToTask->ImplicitTask = FromTask->ImplicitTask;
+    assert(ToTask->ImplicitTask != NULL &&
+           "A task belongs to a team and has an implicit task on the stack");
+  }
+
+  // The task may be resumed at a later point in time.
+  TsanHappensBefore(FromTask->GetTaskPtr());
+
+  if (hasReductionCallback < ompt_set_always && FromTask->InBarrier) {
+    // We want to ignore writes in the runtime code during barriers,
+    // but not when executing tasks with user code!
+    TsanIgnoreWritesEnd();
+  }
+
+  if (prior_task_status == ompt_task_complete) { // task finished
+
+    // The task finishes before a barrier in the surrounding parallel
+    // region ...
+    ParallelData *PData = FromTask->Team;
+    TsanHappensBefore(
+        PData->GetBarrierPtr(FromTask->ImplicitTask->BarrierIndex));
+
+    // ... and before an eventual taskwait by the parent thread.
+    TsanHappensBefore(FromTask->Parent->GetTaskwaitPtr());
+
+    if (FromTask->TaskGroup != nullptr) {
+      // This task is part of a taskgroup, so it finishes before the
+      // corresponding taskgroup_end.
+      TsanHappensBefore(FromTask->TaskGroup->GetPtr());
+    }
+    for (unsigned i = 0; i < FromTask->DependencyCount; i++) {
+      ompt_dependence_t *Dependency = &FromTask->Dependencies[i];
+
+      // in dependencies block following inout and out dependencies!
+      TsanHappensBefore(ToInAddr(Dependency->variable.ptr));
+      if (Dependency->dependence_type == ompt_dependence_type_out ||
+          Dependency->dependence_type == ompt_dependence_type_inout) {
+        TsanHappensBefore(Dependency->variable.ptr);
+      }
+    }
+    while (FromTask != nullptr && --FromTask->RefCount == 0) {
+      TaskData *Parent = FromTask->Parent;
+      if (FromTask->DependencyCount > 0) {
+        delete[] FromTask->Dependencies;
+      }
+      delete FromTask;
+      FromTask = Parent;
+    }
+  }
+  if (hasReductionCallback < ompt_set_always && ToTask->InBarrier) {
+    // We re-enter runtime code which currently performs a barrier.
+    TsanIgnoreWritesBegin();
+  }
+}
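A completing task therefore publishes several happens-before edges at once; summarized from the code above:

    // On ompt_task_complete for task T:
    //   HB(T->Team->Barrier[index])  finishes before the next team barrier
    //   HB(T->Parent->Taskwait)      ... and before a taskwait in the parent
    //   HB(T->TaskGroup->Ptr)        ... and before the enclosing taskgroup_end
    //   HB edges per dependence      (see the two-address scheme at ToInAddr)
    // Afterwards the RefCount chain frees T and any finished ancestors.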
+
+static void ompt_tsan_dependences(ompt_data_t *task_data,
+                                  const ompt_dependence_t *deps, int ndeps) {
+  if (ndeps > 0) {
+    // Copy the data to use it in task_switch and task_end.
+    TaskData *Data = ToTaskData(task_data);
+    Data->Dependencies = new ompt_dependence_t[ndeps];
+    std::memcpy(Data->Dependencies, deps, sizeof(ompt_dependence_t) * ndeps);
+    Data->DependencyCount = ndeps;
+
+    // This callback is executed before this task is first started.
+    TsanHappensBefore(Data->GetTaskPtr());
+  }
+}
+
+/// OMPT event callbacks for handling locking.
+static void ompt_tsan_mutex_acquired(ompt_mutex_t kind, ompt_wait_id_t wait_id,
+                                     const void *codeptr_ra) {
+
+  // Acquire our own lock to make sure that
+  // 1. the previous release has finished.
+  // 2. the next acquire doesn't start before we have finished our release.
+  LocksMutex.lock();
+  std::mutex &Lock = Locks[wait_id];
+  LocksMutex.unlock();
+
+  Lock.lock();
+  TsanHappensAfter(&Lock);
+}
+
+static void ompt_tsan_mutex_released(ompt_mutex_t kind, ompt_wait_id_t wait_id,
+                                     const void *codeptr_ra) {
+  LocksMutex.lock();
+  std::mutex &Lock = Locks[wait_id];
+  LocksMutex.unlock();
+  TsanHappensBefore(&Lock);
+
+  Lock.unlock();
+}
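The per-wait_id shadow mutex mirrors the user's lock so that the TsanHappensBefore in the release callback is always observed by the matching TsanHappensAfter in the next acquire, even if the two callbacks themselves race. A sketch of the resulting event order for one OpenMP lock (the wait_id value is invented):

    // Thread A: omp_unset_lock -> mutex_released(wait_id = 7):
    //             TsanHappensBefore(&Locks[7]); Locks[7].unlock();
    // Thread B: omp_set_lock   -> mutex_acquired(wait_id = 7):
    //             Locks[7].lock(); TsanHappensAfter(&Locks[7]);
    // Holding Locks[7] guarantees B's HappensAfter runs after A's HappensBefore.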
+
+// callback, signature, variable to store result, required support level
+#define SET_OPTIONAL_CALLBACK_T(event, type, result, level)                   \
+  do {                                                                        \
+    ompt_callback_##type##_t tsan_##event = &ompt_tsan_##event;               \
+    result = ompt_set_callback(ompt_callback_##event,                         \
+                               (ompt_callback_t)tsan_##event);                \
+    if (result < level)                                                       \
+      printf("Registered callback '" #event "' is not supported at " #level   \
+             " (%i)\n",                                                       \
+             result);                                                         \
+  } while (0)
+
+#define SET_CALLBACK_T(event, type)                                           \
+  do {                                                                        \
+    int res;                                                                  \
+    SET_OPTIONAL_CALLBACK_T(event, type, res, ompt_set_always);               \
+  } while (0)
+
+#define SET_CALLBACK(event) SET_CALLBACK_T(event, event)
+
+static int ompt_tsan_initialize(ompt_function_lookup_t lookup, int device_num,
+                                ompt_data_t *tool_data) {
+  const char *options = getenv("ARCHER_OPTIONS");
+  archer_flags = new ArcherFlags(options);
+
+  ompt_set_callback_t ompt_set_callback =
+      (ompt_set_callback_t)lookup("ompt_set_callback");
+  if (ompt_set_callback == NULL) {
+    std::cerr << "Could not set callback, exiting..." << std::endl;
+    std::exit(1);
+  }
+  ompt_get_parallel_info =
+      (ompt_get_parallel_info_t)lookup("ompt_get_parallel_info");
+  ompt_get_thread_data =
+      (ompt_get_thread_data_t)lookup("ompt_get_thread_data");
+
+  if (ompt_get_parallel_info == NULL) {
+    fprintf(stderr, "Could not get inquiry function 'ompt_get_parallel_info', "
+                    "exiting...\n");
+    exit(1);
+  }
+
+  SET_CALLBACK(thread_begin);
+  SET_CALLBACK(thread_end);
+  SET_CALLBACK(parallel_begin);
+  SET_CALLBACK(implicit_task);
+  SET_CALLBACK(sync_region);
+  SET_CALLBACK(parallel_end);
+
+  SET_CALLBACK(task_create);
+  SET_CALLBACK(task_schedule);
+  SET_CALLBACK(dependences);
+
+  SET_CALLBACK_T(mutex_acquired, mutex);
+  SET_CALLBACK_T(mutex_released, mutex);
+  SET_OPTIONAL_CALLBACK_T(reduction, sync_region, hasReductionCallback,
+                          ompt_set_never);
+  return 1; // success
+}
+
+static void ompt_tsan_finalize(ompt_data_t *tool_data) {
+  if (archer_flags->print_max_rss) {
+    struct rusage end;
+    getrusage(RUSAGE_SELF, &end);
+    printf("MAX RSS[KBytes] during execution: %ld\n", end.ru_maxrss);
+  }
+
+  if (archer_flags)
+    delete archer_flags;
+}
+
+extern "C" ompt_start_tool_result_t *
+ompt_start_tool(unsigned int omp_version, const char *runtime_version) {
+  const char *options = getenv("ARCHER_OPTIONS");
+  archer_flags = new ArcherFlags(options);
+  if (!archer_flags->enabled) {
+    if (archer_flags->verbose)
+      std::cout << "Archer disabled, stopping operation" << std::endl;
+    delete archer_flags;
+    return NULL;
+  }
+
+  static ompt_start_tool_result_t ompt_start_tool_result = {
+      &ompt_tsan_initialize, &ompt_tsan_finalize, {0}};
+  runOnTsan = 1;
+  RunningOnValgrind();
+  if (!runOnTsan) // If we are not running on TSan, give a different tool the
+                  // chance to be loaded.
+  {
+    if (archer_flags->verbose)
+      std::cout << "Archer detected OpenMP application without TSan; "
+                   "stopping operation"
+                << std::endl;
+    delete archer_flags;
+    return NULL;
+  }
+
+  if (archer_flags->verbose)
+    std::cout << "Archer detected OpenMP application with TSan, supplying "
+                 "OpenMP synchronization semantics"
+              << std::endl;
+  return &ompt_start_tool_result;
+}
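For context, a minimal racy OpenMP program of the kind this tool is meant to analyze; the file name and build line are illustrative, the usual Archer workflow being to compile with TSan instrumentation and simply run:

    // race.cpp -- built e.g. with: clang++ -fopenmp -fsanitize=thread race.cpp
    #include <cstdio>

    int main() {
      int sum = 0;
    #pragma omp parallel for
      for (int i = 0; i < 1000; i++)
        sum += i; // unsynchronized read-modify-write: reported as a data race
      std::printf("%d\n", sum);
      return 0;
    }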