diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
@@ -109,11 +109,11 @@
   // AMDGCN doesn't need to sync threads in a warp
 }
 
-INLINE void __kmpc_impl_named_sync(uint32_t num_threads) {
-  (void)num_threads;
-  // TODO: Implement on top of __SHARED__
-  __builtin_amdgcn_s_barrier();
-}
+// AMDGCN specific kernel initialization
+DEVICE void __kmpc_impl_target_init();
+
+// Equivalent to ptx bar.sync 1. Barrier until num_threads arrive.
+DEVICE void __kmpc_impl_named_sync(uint32_t num_threads);
 
 INLINE void __kmpc_impl_threadfence() {
   __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");
diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
@@ -62,6 +62,59 @@
   return __builtin_amdgcn_ds_bpermute(index << 2, var);
 }
 
+static DEVICE SHARED uint32_t L1_Barrier;
+
+DEVICE void __kmpc_impl_target_init() {
+  // Don't have global ctors, and shared memory is not zero init
+  __atomic_store_n(&L1_Barrier, 0u, __ATOMIC_RELEASE);
+}
+
+DEVICE void __kmpc_impl_named_sync(uint32_t num_threads) {
+  __atomic_thread_fence(__ATOMIC_ACQUIRE);
+
+  uint32_t num_waves = num_threads / WARPSIZE;
+
+  // Partial barrier implementation for amdgcn.
+  // Uses two 16 bit unsigned counters. One for the number of waves to have
+  // reached the barrier, and one to count how many times the barrier has been
+  // passed. These are packed in a single atomically accessed 32 bit integer.
+  // Low bits for the number of waves, assumed zero before this call.
+  // High bits to count the number of times the barrier has been passed.
+
+  assert(num_waves != 0);
+  assert(num_waves * WARPSIZE == num_threads);
+  assert(num_waves < 0xffffu);
+
+  // Increment the low 16 bits once, using the lowest active thread.
+  uint64_t lowestActiveThread = __kmpc_impl_ffs(__kmpc_impl_activemask()) - 1;
+  bool isLowest = GetLaneId() == lowestActiveThread;
+
+  if (isLowest) {
+    uint32_t load =
+        __atomic_fetch_add(&L1_Barrier, 1, __ATOMIC_RELAXED); // commutative
+
+    // Record the number of times the barrier has been passed
+    uint32_t generation = load & 0xffff0000u;
+
+    if ((load & 0x0000ffffu) == (num_waves - 1)) {
+      // Reached num_waves in low bits so this is the last wave.
+      // Set low bits to zero and increment high bits
+      load += 0x00010000u; // wrap is safe
+      load &= 0xffff0000u; // because bits zeroed second
+
+      // Reset the wave counter and release the waiting waves
+      __atomic_store_n(&L1_Barrier, load, __ATOMIC_RELAXED);
+    } else {
+      // more waves still to go, spin until generation counter changes
+      do {
+        __builtin_amdgcn_s_sleep(0);
+        load = __atomic_load_n(&L1_Barrier, __ATOMIC_RELAXED);
+      } while ((load & 0xffff0000u) == generation);
+    }
+  }
+  __atomic_thread_fence(__ATOMIC_RELEASE);
+}
+
 EXTERN uint64_t __ockl_get_local_size(uint32_t);
 EXTERN uint64_t __ockl_get_num_groups(uint32_t);
 DEVICE int GetNumberOfBlocksInKernel() { return __ockl_get_num_groups(0); }
diff --git a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
@@ -63,6 +63,7 @@
       omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
   nThreads = GetNumberOfThreadsInBlock();
   threadLimit = ThreadLimit;
+  __kmpc_impl_target_init();
 }
 
 EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) {
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -183,6 +183,11 @@
 #endif // CUDA_VERSION
 }
 
+// NVPTX specific kernel initialization
+INLINE void __kmpc_impl_target_init() { /* nvptx needs no extra setup */
+}
+
+// Barrier until num_threads arrive.
 INLINE void __kmpc_impl_named_sync(uint32_t num_threads) {
   // The named barrier for active parallel threads of a team in an L1 parallel
   // region to synchronize with each other.