Index: lib/tsan/rtl/tsan_clock.h
===================================================================
--- lib/tsan/rtl/tsan_clock.h
+++ lib/tsan/rtl/tsan_clock.h
@@ -15,6 +15,7 @@
 
 #include "tsan_defs.h"
 #include "tsan_dense_alloc.h"
+#include "sanitizer_common/sanitizer_vector.h"
 
 namespace __tsan {
 
@@ -34,6 +35,7 @@
   u64 get_clean(unsigned tid) const;
 
   void Resize(ClockCache *c, uptr nclk);
+  void ResizeReleaseSequenceVectors(unsigned int tid, u16 nclk);
   void Reset(ClockCache *c);
   void DebugDump(int(*printf)(const char *s, ...));
 
@@ -110,6 +112,11 @@
   u16 size_;
   u16 blocks_;  // Number of second level blocks.
 
+  // Support several release sequences in one sync variable; see page 5 of
+  // https://www.doc.ic.ac.uk/~afd/homepages/papers/pdfs/2017/POPL.pdf
+  Vector<Vector<u64>> released_threads_clock;
+  Vector<bool> release_sequence_blocked;
+
   void Unshare(ClockCache *c);
   bool IsShared() const;
   bool Cachable() const;
@@ -135,11 +142,18 @@
   uptr size() const;
 
   void acquire(ClockCache *c, SyncClock *src);
+  void relaxed_load(ClockCache *c, SyncClock *src);
   void release(ClockCache *c, SyncClock *dst);
+  void relaxed_store(ClockCache *c, SyncClock *dst);
   void acq_rel(ClockCache *c, SyncClock *dst);
   void ReleaseStore(ClockCache *c, SyncClock *dst);
+  void RelaxedStore(ClockCache *c, SyncClock *dst);
+  void acquire_fence();
+  void release_fence();
   void ResetCached(ClockCache *c);
+  void block_release_sequences(SyncClock *dst);
+
   void DebugReset();
   void DebugDump(int(*printf)(const char *s, ...));
 
@@ -163,6 +177,8 @@
   // Number of active elements in the clk_ table (the rest is zeros).
   uptr nclk_;
   u64 clk_[kMaxTidInClock];  // Fixed size vector clock.
+  u64 fence_release_clock[kMaxTidInClock];
+  u64 fence_acquire_clock[kMaxTidInClock];
 
   bool IsAlreadyAcquired(const SyncClock *src) const;
   void UpdateCurrentThread(ClockCache *c, SyncClock *dst) const;
Index: lib/tsan/rtl/tsan_clock.cc
===================================================================
--- lib/tsan/rtl/tsan_clock.cc
+++ lib/tsan/rtl/tsan_clock.cc
@@ -178,6 +178,22 @@
   }
 }
 
+void ThreadClock::relaxed_load(ClockCache *c, SyncClock *src) {
+  DCHECK_LE(nclk_, kMaxTid);
+  DCHECK_LE(src->size_, kMaxTid);
+
+  // Check if it's empty -> no need to do anything.
+  if (src->size_ == 0) {
+    return;
+  }
+
+  uptr i = 0;
+  for (ClockElem &ce : *src) {
+    fence_acquire_clock[i] = max(fence_acquire_clock[i], ce.epoch);
+    ++i;
+  }
+}
+
 void ThreadClock::release(ClockCache *c, SyncClock *dst) {
   DCHECK_LE(nclk_, kMaxTid);
   DCHECK_LE(dst->size_, kMaxTid);
@@ -185,6 +201,9 @@
   if (dst->size_ == 0) {
     // ReleaseStore will correctly set release_store_tid_,
    // which can be important for future operations.
+    // TODO(sobir.yorov94):
+    // This may be wrong because an RMW does not block the release
+    // sequence, whereas ReleaseStore blocks the other threads' sequences.
     ReleaseStore(c, dst);
     return;
   }
@@ -194,6 +213,11 @@
   if (dst->size_ < nclk_)
     dst->Resize(c, nclk_);
 
+  // Check if we need to resize the release sequence vector for tid.
+  if (dst->released_threads_clock.Size() < dst->size_ ||
+      dst->released_threads_clock[tid_].Size() < dst->size_) {
+    dst->ResizeReleaseSequenceVectors(tid_, dst->size_);
+  }
   // Check if we had not acquired anything from other threads
   // since the last release on dst. If so, we need to update
   // only dst->elem(tid_).
@@ -216,6 +240,12 @@
   dst->FlushDirty();
   uptr i = 0;
   for (ClockElem &ce : *dst) {
+    dst->released_threads_clock[tid_][i] = ce.epoch;
+    ++i;
+  }
+  dst->release_sequence_blocked[tid_] = false;
+  i = 0;
+  for (ClockElem &ce : *dst) {
     ce.epoch = max(ce.epoch, clk_[i]);
     ce.reused = 0;
     i++;
@@ -233,9 +263,31 @@
   dst->elem(tid_).reused = reused_;
 }
 
+void ThreadClock::relaxed_store(ClockCache *c, SyncClock *dst) {
+  DCHECK_LE(nclk_, kMaxTid);
+  DCHECK_LE(dst->size_, kMaxTid);
+
+  // Check if we need to resize dst.
+  if (dst->size_ < nclk_)
+    dst->Resize(c, nclk_);
+
+  // TODO(sobir.yorov94@gmail.com):
+  // Optimize the case where fence_release_clock has not changed since last time.
+  dst->Unshare(c);
+  uptr i = 0;
+  for (ClockElem &ce : *dst) {
+    ce.epoch = max(ce.epoch, fence_release_clock[i]);
+    ce.reused = 0;
+    i++;
+  }
+  dst->FlushDirty();
+}
+
 void ThreadClock::ReleaseStore(ClockCache *c, SyncClock *dst) {
   DCHECK_LE(nclk_, kMaxTid);
   DCHECK_LE(dst->size_, kMaxTid);
+  DCHECK_LE(dst->released_threads_clock.Size(), kMaxTid);
+  DCHECK_LE(dst->release_sequence_blocked.Size(), kMaxTid);
   CPP_STAT_INC(StatClockStore);
 
   if (dst->size_ == 0 && cached_idx_ != 0) {
@@ -268,6 +320,19 @@
   if (dst->size_ < nclk_)
     dst->Resize(c, nclk_);
 
+  // Check if we need to resize the release sequence vector for tid.
+  if (dst->released_threads_clock.Size() < dst->size_ ||
+      dst->released_threads_clock[tid_].Size() < dst->size_) {
+    dst->ResizeReleaseSequenceVectors(tid_, dst->size_);
+  }
+
+  // Since this is a release store, block the other threads' release sequences.
+  block_release_sequences(dst);
+  dst->release_sequence_blocked[tid_] = false;
+  for (uptr i = 0; i < nclk_; ++i) {
+    dst->released_threads_clock[tid_][i] = clk_[i];
+  }
+
   if (dst->release_store_tid_ == tid_ &&
       dst->release_store_reused_ == reused_ &&
       dst->elem(tid_).epoch > last_acquire_) {
@@ -281,6 +346,7 @@
   dst->Unshare(c);
   // Note: dst can be larger than this ThreadClock.
   // This is fine since clk_ beyond size is all zeros.
+
   uptr i = 0;
   for (ClockElem &ce : *dst) {
     ce.epoch = clk_[i];
@@ -309,12 +375,72 @@
   }
 }
 
+void ThreadClock::RelaxedStore(ClockCache *c, SyncClock *dst) {
+  DCHECK_LE(nclk_, kMaxTid);
+  DCHECK_LE(dst->size_, kMaxTid);
+  DCHECK_LE(dst->released_threads_clock.Size(), kMaxTid);
+  DCHECK_LE(dst->release_sequence_blocked.Size(), kMaxTid);
+  // TODO(yorov.sobir): add stat here
+
+  // Check if we need to resize dst.
+  if (dst->size_ < nclk_) {
+    dst->Resize(c, nclk_);
+  }
+  // Check if we need to resize the release sequence vector for tid.
+  if (dst->released_threads_clock.Size() < dst->size_ ||
+      dst->released_threads_clock[tid_].Size() < dst->size_) {
+    dst->ResizeReleaseSequenceVectors(tid_, dst->size_);
+  }
+  // A relaxed store blocks all release sequences on the current atomic.
+  block_release_sequences(dst);
+  // Unshare before changing dst.
+  dst->Unshare(c);
+  if (!dst->release_sequence_blocked[tid_]) {
+    uptr i = 0;
+    for (ClockElem &ce : *dst) {
+      ce.epoch = max(dst->released_threads_clock[tid_][i],
+                     fence_release_clock[i]);
+      ce.reused = 0;
+      i++;
+    }
+  } else {
+    uptr i = 0;
+    for (ClockElem &ce : *dst) {
+      ce.epoch = fence_release_clock[i];
+      ce.reused = 0;
+      i++;
+    }
+  }
+  dst->FlushDirty();
+}
+
 void ThreadClock::acq_rel(ClockCache *c, SyncClock *dst) {
   CPP_STAT_INC(StatClockAcquireRelease);
   acquire(c, dst);
   ReleaseStore(c, dst);
 }
 
+void ThreadClock::acquire_fence() {
+  DCHECK_LE(nclk_, kMaxTid);
+  for (uptr i = 0; i < nclk_; ++i) {
+    clk_[i] = max(clk_[i], fence_acquire_clock[i]);
+  }
+}
+
+void ThreadClock::release_fence() {
+  DCHECK_LE(nclk_, kMaxTid);
+  for (uptr i = 0; i < nclk_; ++i) {
+    fence_release_clock[i] = clk_[i];
+  }
+}
+
+void ThreadClock::block_release_sequences(SyncClock *dst) {
+  for (uptr i = 0; i < nclk_; ++i) {
+    if (i != tid_) {
+      dst->release_sequence_blocked[i] = true;
+    }
+  }
+}
+
 // Updates only single element related to the current thread in dst->clk_.
 void ThreadClock::UpdateCurrentThread(ClockCache *c, SyncClock *dst) const {
   // Update the threads time, but preserve 'acquired' flag.
@@ -573,6 +699,12 @@
       dirty_[1].tid, dirty_[1].epoch);
 }
 
+void SyncClock::ResizeReleaseSequenceVectors(const unsigned int tid, u16 nclk) {
+  released_threads_clock.Resize(nclk);
+  released_threads_clock[tid].Resize(nclk);
+  release_sequence_blocked.Resize(nclk);
+}
+
 void SyncClock::Iter::Next() {
   // Finished with the current block, move on to the next one.
   block_++;
Index: lib/tsan/rtl/tsan_interface_atomic.cc
===================================================================
--- lib/tsan/rtl/tsan_interface_atomic.cc
+++ lib/tsan/rtl/tsan_interface_atomic.cc
@@ -224,16 +224,20 @@
   CHECK(IsLoadOrder(mo));
   // This fast-path is critical for performance.
   // Assume the access is atomic.
-  if (!IsAcquireOrder(mo)) {
-    MemoryReadAtomic(thr, pc, (uptr)a, SizeLog<T>());
-    return NoTsanAtomicLoad(a, mo);
-  }
+//  if (!IsAcquireOrder(mo)) {
+//    MemoryReadAtomic(thr, pc, (uptr)a, SizeLog<T>());
+//    return NoTsanAtomicLoad(a, mo);
+//  }
   // Don't create sync object if it does not exist yet. For example, an atomic
   // pointer is initialized to nullptr and then periodically acquire-loaded.
   T v = NoTsanAtomicLoad(a, mo);
   SyncVar *s = ctx->metamap.GetIfExistsAndLock((uptr)a, false);
   if (s) {
-    AcquireImpl(thr, pc, &s->clock);
+    if (IsAcquireOrder(mo)) {
+      AcquireImpl(thr, pc, &s->clock);
+    } else {
+      RelaxedLoadImpl(thr, pc, &s->clock);
+    }
     // Re-read under sync mutex because we need a consistent snapshot
     // of the value and the clock we acquire.
     v = NoTsanAtomicLoad(a, mo);
@@ -264,16 +268,17 @@
   // Assume the access is atomic.
   // Strictly saying even relaxed store cuts off release sequence,
   // so must reset the clock.
-  if (!IsReleaseOrder(mo)) {
-    NoTsanAtomicStore(a, v, mo);
-    return;
-  }
+  __sync_synchronize();
   SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, (uptr)a, true);
-  thr->fast_state.IncrementEpoch();
-  // Can't increment epoch w/o writing to the trace as well.
-  TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0);
-  ReleaseStoreImpl(thr, pc, &s->clock);
+  if (IsReleaseOrder(mo)) {
+    thr->fast_state.IncrementEpoch();
+    // Can't increment epoch w/o writing to the trace as well.
+    TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0);
+    ReleaseStoreImpl(thr, pc, &s->clock);
+  } else {
+    RelaxedStoreImpl(thr, pc, &s->clock);
+  }
   NoTsanAtomicStore(a, v, mo);
   s->mtx.Unlock();
 }
@@ -281,9 +286,8 @@ template<typename T, T (*F)(volatile T *v, T op)>
 static T AtomicRMW(ThreadState *thr, uptr pc, volatile T *a, T v, morder mo) {
   MemoryWriteAtomic(thr, pc, (uptr)a, SizeLog<T>());
-  SyncVar *s = 0;
+  SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, (uptr)a, true);
   if (mo != mo_relaxed) {
-    s = ctx->metamap.GetOrCreateAndLock(thr, pc, (uptr)a, true);
     thr->fast_state.IncrementEpoch();
     // Can't increment epoch w/o writing to the trace as well.
     TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0);
@@ -293,6 +297,8 @@
       ReleaseImpl(thr, pc, &s->clock);
     else if (IsAcquireOrder(mo))
       AcquireImpl(thr, pc, &s->clock);
+  } else {
+    RelaxedImpl(thr, pc, &s->clock);
   }
   v = F(a, v);
   if (s)
@@ -405,10 +411,9 @@
     volatile T *a, T *c, T v, morder mo, morder fmo) {
   (void)fmo;  // Unused because llvm does not pass it yet.
   MemoryWriteAtomic(thr, pc, (uptr)a, SizeLog<T>());
-  SyncVar *s = 0;
   bool write_lock = mo != mo_acquire && mo != mo_consume;
+  SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, (uptr)a, write_lock);
   if (mo != mo_relaxed) {
-    s = ctx->metamap.GetOrCreateAndLock(thr, pc, (uptr)a, write_lock);
     thr->fast_state.IncrementEpoch();
     // Can't increment epoch w/o writing to the trace as well.
     TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0);
@@ -418,6 +423,8 @@
       ReleaseImpl(thr, pc, &s->clock);
     else if (IsAcquireOrder(mo))
       AcquireImpl(thr, pc, &s->clock);
+  } else {
+    RelaxedImpl(thr, pc, &s->clock);
   }
   T cc = *c;
   T pr = func_cas(a, cc, v);
@@ -446,8 +453,11 @@
 }
 
 static void AtomicFence(ThreadState *thr, uptr pc, morder mo) {
-  // FIXME(dvyukov): not implemented.
-  __sync_synchronize();
+  if (IsAcquireOrder(mo))
+    AcquireFenceImpl(thr, pc);
+  else if (IsReleaseOrder(mo))
+    ReleaseFenceImpl(thr, pc);
+  NoTsanAtomicFence(mo);
 }
 #endif
Index: lib/tsan/rtl/tsan_rtl.h
===================================================================
--- lib/tsan/rtl/tsan_rtl.h
+++ lib/tsan/rtl/tsan_rtl.h
@@ -801,9 +801,14 @@
 void ReleaseStore(ThreadState *thr, uptr pc, uptr addr);
 void AfterSleep(ThreadState *thr, uptr pc);
 void AcquireImpl(ThreadState *thr, uptr pc, SyncClock *c);
+void RelaxedLoadImpl(ThreadState *thr, uptr pc, SyncClock *c);
 void ReleaseImpl(ThreadState *thr, uptr pc, SyncClock *c);
+void RelaxedImpl(ThreadState *thr, uptr pc, SyncClock *c);
 void ReleaseStoreImpl(ThreadState *thr, uptr pc, SyncClock *c);
+void RelaxedStoreImpl(ThreadState *thr, uptr pc, SyncClock *c);
 void AcquireReleaseImpl(ThreadState *thr, uptr pc, SyncClock *c);
+void AcquireFenceImpl(ThreadState *thr, uptr pc);
+void ReleaseFenceImpl(ThreadState *thr, uptr pc);
 
 // The hacky call uses custom calling convention and an assembly thunk.
 // It is considerably faster that a normal call for the caller
Index: lib/tsan/rtl/tsan_rtl_mutex.cc
===================================================================
--- lib/tsan/rtl/tsan_rtl_mutex.cc
+++ lib/tsan/rtl/tsan_rtl_mutex.cc
@@ -483,6 +483,14 @@
   StatInc(thr, StatSyncAcquire);
 }
 
+void RelaxedLoadImpl(ThreadState *thr, uptr pc, SyncClock *c) {
+  if (thr->ignore_sync) {
+    return;
+  }
+  thr->clock.set(thr->fast_state.epoch());
+  thr->clock.relaxed_load(&thr->proc()->clock_cache, c);
+}
+
 void ReleaseImpl(ThreadState *thr, uptr pc, SyncClock *c) {
   if (thr->ignore_sync)
     return;
@@ -492,6 +500,16 @@
   StatInc(thr, StatSyncRelease);
 }
 
+void RelaxedImpl(ThreadState *thr, uptr pc, SyncClock *c) {
+  if (thr->ignore_sync) {
+    return;
+  }
+  thr->clock.set(thr->fast_state.epoch());
+  thr->fast_synch_epoch = thr->fast_state.epoch();
+  thr->clock.relaxed_store(&thr->proc()->clock_cache, c);
+  // TODO(yorov.sobir): add stat here
+}
+
 void ReleaseStoreImpl(ThreadState *thr, uptr pc, SyncClock *c) {
   if (thr->ignore_sync)
     return;
@@ -501,6 +519,16 @@
   StatInc(thr, StatSyncRelease);
 }
 
+void RelaxedStoreImpl(ThreadState *thr, uptr pc, SyncClock *c) {
+  if (thr->ignore_sync) {
+    return;
+  }
+  thr->clock.set(thr->fast_state.epoch());
+  thr->fast_synch_epoch = thr->fast_state.epoch();
+  thr->clock.RelaxedStore(&thr->proc()->clock_cache, c);
+  // TODO(yorov.sobir): add stat here
+}
+
 void AcquireReleaseImpl(ThreadState *thr, uptr pc, SyncClock *c) {
   if (thr->ignore_sync)
     return;
@@ -511,6 +539,26 @@
   StatInc(thr, StatSyncRelease);
 }
 
+void AcquireFenceImpl(ThreadState *thr, uptr pc) {
+  if (thr->ignore_sync) {
+    return;
+  }
+  thr->clock.set(thr->fast_state.epoch());
+  thr->fast_synch_epoch = thr->fast_state.epoch();
+  thr->clock.acquire_fence();
+  // TODO(sobir.yorov94@gmail.com): add stat
+}
+
+void ReleaseFenceImpl(ThreadState *thr, uptr pc) {
+  if (thr->ignore_sync) {
+    return;
+  }
+  thr->clock.set(thr->fast_state.epoch());
+  thr->fast_synch_epoch = thr->fast_state.epoch();
+  thr->clock.release_fence();
+  // TODO(sobir.yorov94@gmail.com): add stat
+}
+
 void ReportDeadlock(ThreadState *thr, uptr pc, DDReport *r) {
   if (r == 0)
     return;
Index: test/CMakeLists.txt
===================================================================
--- test/CMakeLists.txt
+++ test/CMakeLists.txt
@@ -15,7 +15,7 @@
 endif()
 
 if(COMPILER_RT_STANDALONE_BUILD)
-  add_executable(FileCheck IMPORTED GLOBAL)
+  add_executable(FileCheck IMPORTED GLOBAL tsan/atomic_release_sequence_blocking.cpp tsan/fence_norace.cpp)
   set_property(TARGET FileCheck PROPERTY IMPORTED_LOCATION ${LLVM_TOOLS_BINARY_DIR}/FileCheck)
   list(APPEND SANITIZER_COMMON_LIT_TEST_DEPS FileCheck)
 endif()
Index: test/tsan/atomic_release_sequence_blocking.cpp
===================================================================
--- /dev/null
+++ test/tsan/atomic_release_sequence_blocking.cpp
@@ -0,0 +1,45 @@
+// RUN: %clangxx_tsan -O0 %s -o %t && %run %t 2>&1 | FileCheck %s
+#include "test.h"
+
+int nax;
+int x;
+
+void* thread1(void* arg) {
+  nax = 1;
+  __atomic_store_n(&x, 1, __ATOMIC_RELEASE);
+  return 0;
+}
+
+void* thread2(void* arg) {
+  if (__atomic_load_n(&x, __ATOMIC_ACQUIRE) == 1) {
+    __atomic_store_n(&x, 2, __ATOMIC_RELAXED);
+  }
+  return 0;
+}
+
+void* thread3(void* arg) {
+  if (__atomic_load_n(&x, __ATOMIC_ACQUIRE) == 2) {
+    int temp = nax;
+    (void)temp;
+  } else {
+    fprintf(stderr, "DONE\n");
+  }
+  return 0;
+}
+
+int main() {
+  pthread_t t1;
+  pthread_t t2;
+  pthread_t t3;
+  pthread_create(&t1, nullptr, thread1, nullptr);
+  pthread_create(&t2, nullptr, thread2, nullptr);
+  pthread_create(&t3, nullptr, thread3, nullptr);
+
+  pthread_join(t1, nullptr);
+  pthread_join(t2, nullptr);
+  pthread_join(t3, nullptr);
+  return 0;
+}
+
+// CHECK: WARNING: ThreadSanitizer: data race
+// CHECK-NOT-EMPTY:
\ No newline at end of file
Index: test/tsan/fence_norace.cpp
===================================================================
--- /dev/null
+++ test/tsan/fence_norace.cpp
@@ -0,0 +1,35 @@
+// RUN: %clangxx_tsan -O0 %s -o %t && %run %t 2>&1 | FileCheck %s
+#include "test.h"
+#include <atomic>
+
+int nax;
+std::atomic<int> x;
+
+void* thread1(void* arg) {
+  nax = 1;
+  std::atomic_thread_fence(std::memory_order_release);
+  x.store(1, std::memory_order_relaxed);
+  return 0;
+}
+
+void* thread2(void* arg) {
+  if (x.load(std::memory_order_relaxed) == 1) {
+    std::atomic_thread_fence(std::memory_order_acquire);
+    printf("%d\n", nax);
+  }
+  return 0;
+}
+
+int main() {
+  pthread_t t1;
+  pthread_t t2;
+  pthread_create(&t1, nullptr, thread1, nullptr);
+  pthread_create(&t2, nullptr, thread2, nullptr);
+
+  pthread_join(t1, nullptr);
+  pthread_join(t2, nullptr);
+  return 0;
+}
+
+// CHECK-NOT: ThreadSanitizer: data race
+// CHECK: 1
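
Note (not part of the patch): the two tests above cover a relaxed store breaking a release sequence and fence-based synchronization. The TODO in ThreadClock::release points at the complementary case, which a third test could pin down: an atomic read-modify-write continues a release sequence, so no race should be reported. Below is a minimal sketch under the same test.h and lit conventions assumed by the tests in this patch; the file name and thread structure are illustrative only.

// RUN: %clangxx_tsan -O0 %s -o %t && %run %t 2>&1 | FileCheck %s
// Hypothetical companion test (not part of this patch): a relaxed RMW
// continues the release sequence headed by thread1's release store, so
// thread3's read of nax is ordered after nax = 1 and must not race.
#include "test.h"

int nax;
int x;

void* thread1(void* arg) {
  nax = 1;
  __atomic_store_n(&x, 1, __ATOMIC_RELEASE);
  return 0;
}

void* thread2(void* arg) {
  // Relaxed RMW: stays part of thread1's release sequence.
  __atomic_fetch_add(&x, 1, __ATOMIC_RELAXED);
  return 0;
}

void* thread3(void* arg) {
  if (__atomic_load_n(&x, __ATOMIC_ACQUIRE) == 2) {
    // Reads-from the RMW, which belongs to thread1's release sequence,
    // so this access happens-after nax = 1.
    int temp = nax;
    (void)temp;
  }
  return 0;
}

int main() {
  pthread_t t1, t2, t3;
  pthread_create(&t1, nullptr, thread1, nullptr);
  pthread_create(&t2, nullptr, thread2, nullptr);
  pthread_create(&t3, nullptr, thread3, nullptr);
  pthread_join(t1, nullptr);
  pthread_join(t2, nullptr);
  pthread_join(t3, nullptr);
  fprintf(stderr, "DONE\n");
  return 0;
}

// CHECK-NOT: WARNING: ThreadSanitizer: data race
// CHECK: DONE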