diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip @@ -56,7 +56,7 @@ { __atomic_thread_fence(__ATOMIC_ACQUIRE); - uint32_t num_waves = num_threads / WARPSIZE; + uint32_t num_waves = (num_threads + WARPSIZE - 1) / WARPSIZE; // Partial barrier implementation for amdgcn. // Uses two 16 bit unsigned counters. One for the number of waves to have