diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h @@ -124,6 +124,10 @@ __builtin_amdgcn_s_barrier(); } +EXTERN void __kmpc_impl_threadfence(void); +EXTERN void __kmpc_impl_threadfence_block(void); +EXTERN void __kmpc_impl_threadfence_system(void); + // DEVICE versions of part of libc extern "C" { DEVICE __attribute__((noreturn)) void diff --git a/openmp/libomptarget/deviceRTLs/common/src/loop.cu b/openmp/libomptarget/deviceRTLs/common/src/loop.cu --- a/openmp/libomptarget/deviceRTLs/common/src/loop.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/loop.cu @@ -363,7 +363,7 @@ __kmpc_barrier(loc, threadId); if (tid == 0) { omptarget_nvptx_threadPrivateContext->Cnt() = 0; - __threadfence_block(); + __kmpc_impl_threadfence_block(); } __kmpc_barrier(loc, threadId); PRINT(LD_LOOP, diff --git a/openmp/libomptarget/deviceRTLs/common/src/support.cu b/openmp/libomptarget/deviceRTLs/common/src/support.cu --- a/openmp/libomptarget/deviceRTLs/common/src/support.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/support.cu @@ -212,7 +212,7 @@ if (Rank == 0) { parallelLevel[GetWarpId()] += (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); - __threadfence(); + __kmpc_impl_threadfence(); } __kmpc_impl_syncwarp(Mask); } @@ -224,7 +224,7 @@ if (Rank == 0) { parallelLevel[GetWarpId()] -= (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); - __threadfence(); + __kmpc_impl_threadfence(); } __kmpc_impl_syncwarp(Mask); } diff --git a/openmp/libomptarget/deviceRTLs/common/src/sync.cu b/openmp/libomptarget/deviceRTLs/common/src/sync.cu --- a/openmp/libomptarget/deviceRTLs/common/src/sync.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/sync.cu @@ -133,7 +133,7 @@ EXTERN void __kmpc_flush(kmp_Ident *loc) { PRINT0(LD_IO, "call kmpc_flush\n"); - __threadfence(); + __kmpc_impl_threadfence(); } //////////////////////////////////////////////////////////////////////////////// diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu @@ -211,7 +211,7 @@ } // FIXME: Need to see the impact of doing it here. - __threadfence_block(); + __kmpc_impl_threadfence_block(); DSPRINT0(DSFLAG, "Exiting __kmpc_data_sharing_environment_begin\n"); @@ -289,7 +289,7 @@ } // FIXME: Need to see the impact of doing it here. - __threadfence_block(); + __kmpc_impl_threadfence_block(); DSPRINT0(DSFLAG, "Exiting __kmpc_data_sharing_environment_end\n"); return; @@ -357,7 +357,7 @@ if (GetThreadIdInBlock() == 0) data_sharing_init_stack_common(); - __threadfence_block(); + __kmpc_impl_threadfence_block(); } INLINE static void* data_sharing_push_stack_common(size_t PushSize) { @@ -474,7 +474,7 @@ EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) { ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime."); - __threadfence_block(); + __kmpc_impl_threadfence_block(); if (GetThreadIdInBlock() % WARPSIZE == 0) { unsigned WID = GetWarpId(); @@ -555,7 +555,7 @@ ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(), "Must be called only in the target master thread."); *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size); - __threadfence(); + __kmpc_impl_threadfence(); } EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode, @@ -569,7 +569,7 @@ } return; } - __threadfence(); + __kmpc_impl_threadfence(); ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(), "Must be called only in the target master thread."); omptarget_nvptx_simpleMemoryManager.Release(); diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu @@ -241,7 +241,7 @@ char *scratchpad = GetTeamsReductionScratchpad(); scratchFct(reduce_data, scratchpad, TeamId, NumTeams); - __threadfence(); + __kmpc_impl_threadfence(); // atomicInc increments 'timestamp' and has a range [0, NumTeams-1]. // It resets 'timestamp' back to 0 once the last team increments @@ -389,7 +389,7 @@ EXTERN void __kmpc_nvptx_teams_end_reduce_nowait_simple(kmp_Ident *loc, int32_t global_tid, kmp_CriticalName *crit) { - __threadfence_system(); + __kmpc_impl_threadfence_system(); (void)atomicExch((uint32_t *)crit, 0); } @@ -446,7 +446,7 @@ lgcpyFct(global_buffer, ModBockId, reduce_data); else lgredFct(global_buffer, ModBockId, reduce_data); - __threadfence_system(); + __kmpc_impl_threadfence_system(); // Increment team counter. // This counter is incremented by all teams in the current diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -163,4 +163,8 @@ : "memory"); } +INLINE void __kmpc_impl_threadfence(void) { __threadfence(); } +INLINE void __kmpc_impl_threadfence_block(void) { __threadfence_block(); } +INLINE void __kmpc_impl_threadfence_system(void) { __threadfence_system(); } + #endif