Index: runtime/src/kmp_csupport.cpp
===================================================================
--- runtime/src/kmp_csupport.cpp
+++ runtime/src/kmp_csupport.cpp
@@ -3810,12 +3810,17 @@
     // we are the first thread, allocate the array of flags
     kmp_int64 size =
         trace_count / 8 + 8; // in bytes, use single bit per iteration
-    sh_buf->doacross_flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
+    flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
+    KMP_MB();
+    sh_buf->doacross_flags = flags;
   } else if ((kmp_int64)flags == 1) {
     // initialization is still in progress, need to wait
     while ((volatile kmp_int64)sh_buf->doacross_flags == 1) {
       KMP_YIELD(TRUE);
     }
+    KMP_MB();
+  } else {
+    KMP_MB();
   }
   KMP_DEBUG_ASSERT((kmp_int64)sh_buf->doacross_flags >
                    1); // check value of pointer
@@ -3912,6 +3917,7 @@
   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
     KMP_YIELD(TRUE);
   }
+  KMP_MB();
   KA_TRACE(20,
            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
             gtid, (iter_number << 5) + shft));
@@ -3964,6 +3970,8 @@
   shft = iter_number % 32; // use 32-bit granularity
   iter_number >>= 5; // divided by 32
   flag = 1 << shft;
+  // NOTE(JH): no explicit memory barrier should be needed here; the atomic OR
+  // below is expected to act as the synchronization point. TODO: confirm.
   if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
     KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
   KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
Index: runtime/test/worksharing/for/kmp_doacross_check.c
===================================================================
--- runtime/test/worksharing/for/kmp_doacross_check.c
+++ runtime/test/worksharing/for/kmp_doacross_check.c
@@ -24,7 +24,7 @@
   dims.lo = 1;
   dims.up = N-1;
   dims.st = 1;
-  #pragma omp parallel
+  #pragma omp parallel num_threads(4)
   {
     int i, gtid;
     long long vec;