diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt
--- a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt
@@ -70,6 +70,12 @@
   ${devicertl_base_directory}/common/src/sync.cu
   ${devicertl_base_directory}/common/src/task.cu)
 
+# LLVM IR sources for functions that clang cannot yet express through an
+# intrinsic. They are appended as-is to each per-mcpu bc_files list below and
+# are intended to be removed as clang is extended.
+set(llvm_sources
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_atomic.ll)
+
 set(h_files
   ${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_interface.h
   ${CMAKE_CURRENT_SOURCE_DIR}/src/hip_atomics.h
@@ -137,6 +143,7 @@
 foreach(mcpu ${mcpus})
   set(bc_files)
   add_cuda_bc_library(${cuda_sources})
+  list(APPEND bc_files ${llvm_sources})
 
   set(bc_libname lib${libname}-${mcpu}.bc)
   add_custom_command(
diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_atomic.ll b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_atomic.ll
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_atomic.ll
@@ -0,0 +1,28 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+target triple = "amdgcn-amd-amdhsa"
+
+; These functions are implemented in IR as there is not yet a corresponding intrinsic
+; available in clang. The intent is to remove them once said intrinsics are implemented.
+
+declare i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* nocapture, i32, i32, i32, i1) #1
+
+; Forwards to llvm.amdgcn.atomic.inc and returns its result. Attrs: alwaysinline nounwind
+define i32 @__amdgcn_atomic_inc_u32(i32* %x, i32 %v) #0 {
+entry:
+  %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %x, i32 %v,
+  i32 7, ; Ordering. AtomicOrdering.h: SequentiallyConsistent is 7 (5 is Release)
+  i32 2, ; Scope. SyncScope.h:  OpenCLAllSVMDevices is 2
+  i1 0 ; Volatile. False for consistency with other atomic operations
+  )
+  ret i32 %ret
+}
+
+define i64 @__amdgcn_atomic_max_u64(i64* %addr, i64 %val) #0 {
+entry:
+  %previous = atomicrmw umax i64* %addr, i64 %val seq_cst ; unsigned max, yields the old value
+  ret i64 %previous
+}
+
+
+attributes #0 = { alwaysinline nounwind }
+attributes #1 = { nounwind argmemonly }
diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/hip_atomics.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/hip_atomics.h
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/hip_atomics.h
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/hip_atomics.h
@@ -11,29 +11,35 @@
 
 #include "target_impl.h"
 
-DEVICE unsigned atomicAdd(unsigned *address, unsigned val);
-DEVICE int atomicAdd(int *address, int val);
-DEVICE unsigned long long atomicAdd(unsigned long long *address,
-                                    unsigned long long val);
-
-DEVICE unsigned atomicInc(unsigned *address);
-DEVICE unsigned atomicInc(unsigned *address, unsigned max);
-DEVICE int atomicInc(int *address);
-
-DEVICE int atomicMax(int *address, int val);
-DEVICE unsigned atomicMax(unsigned *address, unsigned val);
-DEVICE unsigned long long atomicMax(unsigned long long *address,
-                                    unsigned long long val);
-
-DEVICE int atomicExch(int *address, int val);
-DEVICE unsigned atomicExch(unsigned *address, unsigned val);
-DEVICE unsigned long long atomicExch(unsigned long long *address,
-                                     unsigned long long val);
-
-DEVICE unsigned atomicCAS(unsigned *address, unsigned compare, unsigned val);
-DEVICE int atomicCAS(int *address, int compare, int val);
-DEVICE unsigned long long atomicCAS(unsigned long long *address,
-                                    unsigned long long compare,
-                                    unsigned long long val);
-
+namespace {
+
+template <typename T> DEVICE T atomicAdd(T *x, T v) {
+  return __atomic_fetch_add(x, v, __ATOMIC_SEQ_CST);
+}
+
+// atomicInc is only provided for uint32_t as that is the only call site
+EXTERN uint32_t __amdgcn_atomic_inc_u32(uint32_t *, uint32_t);
+INLINE uint32_t atomicInc(uint32_t *address, uint32_t val) {
+  return __amdgcn_atomic_inc_u32(address, val);
+}
+
+EXTERN uint64_t __amdgcn_atomic_max_u64(uint64_t *, uint64_t);
+INLINE uint64_t atomicMax(uint64_t *address, uint64_t val) {
+  return __amdgcn_atomic_max_u64(address, val);
+}
+
+template <typename T> DEVICE T atomicExch(T *address, T val) {
+  T previous;
+  __atomic_exchange(address, &val, &previous, __ATOMIC_SEQ_CST);
+  return previous;
+}
+
+// Yields the value found at *address, which equals compare iff val was stored
+template <typename T> DEVICE T atomicCAS(T *address, T compare, T val) {
+  (void)__atomic_compare_exchange(address, &compare, &val, false,
+                                  __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
+  return compare;
+}
+
+} // namespace
 #endif
diff --git a/openmp/libomptarget/deviceRTLs/common/src/loop.cu b/openmp/libomptarget/deviceRTLs/common/src/loop.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/loop.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/loop.cu
@@ -793,8 +793,7 @@
     // Atomic max of iterations.
     uint64_t *varArray = (uint64_t *)array;
     uint64_t elem = varArray[i];
-    (void)__kmpc_atomic_max((unsigned long long int *)Buffer,
-                            (unsigned long long int)elem);
+    (void)__kmpc_atomic_max(Buffer, elem);
 
     // Barrier.
     syncWorkersInGenericMode(NumThreads);