Index: openmp/trunk/runtime/src/kmp_csupport.cpp =================================================================== --- openmp/trunk/runtime/src/kmp_csupport.cpp +++ openmp/trunk/runtime/src/kmp_csupport.cpp @@ -3815,7 +3815,9 @@ if (flags == NULL) { // we are the first thread, allocate the array of flags size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration - sh_buf->doacross_flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1); + flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1); + KMP_MB(); + sh_buf->doacross_flags = flags; } else if (flags == (kmp_uint32 *)1) { #if KMP_32_BIT_ARCH // initialization is still in progress, need to wait @@ -3824,6 +3826,9 @@ while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL) #endif KMP_YIELD(TRUE); + KMP_MB(); + } else { + KMP_MB(); } KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value pr_buf->th_doacross_flags = @@ -3919,6 +3924,7 @@ while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) { KMP_YIELD(TRUE); } + KMP_MB(); KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n", gtid, (iter_number << 5) + shft)); @@ -3971,6 +3977,7 @@ shft = iter_number % 32; // use 32-bit granularity iter_number >>= 5; // divided by 32 flag = 1 << shft; + KMP_MB(); if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag); KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid, Index: openmp/trunk/runtime/test/worksharing/for/kmp_doacross_check.c =================================================================== --- openmp/trunk/runtime/test/worksharing/for/kmp_doacross_check.c +++ openmp/trunk/runtime/test/worksharing/for/kmp_doacross_check.c @@ -24,7 +24,7 @@ dims.lo = 1; dims.up = N-1; dims.st = 1; - #pragma omp parallel + #pragma omp parallel num_threads(4) { int i, gtid; long long vec;