diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -2455,6 +2455,63 @@
 and ``__OPENCL_MEMORY_SCOPE_SUB_GROUP`` are provided, with values
 corresponding to the enumerators of OpenCL's ``memory_scope`` enumeration.)
 
+AMDGCN specific builtins
+-------------------------
+
+``__builtin_amdgcn_fence``
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``__builtin_amdgcn_fence`` exposes the LLVM `fence instruction <https://llvm.org/docs/LangRef.html#fence-instruction>`_
+in Clang. It takes a C++11-compatible memory ordering and an AMDGCN-specific
+synchronization scope as arguments, and generates a fence instruction in the IR.
+
+**Syntax**:
+
+.. code-block:: c++
+
+    __builtin_amdgcn_fence(unsigned int memory_ordering, const char *sync_scope)
+
+**Example of use**:
+
+.. code-block:: c++
+
+  void my_fence(int i) {
+    i++;
+    __builtin_amdgcn_fence(__ATOMIC_ACQUIRE,  "workgroup");
+    i--;
+    __builtin_amdgcn_fence(__ATOMIC_SEQ_CST,  "agent");
+  }
+
+**Description**:
+
+The first argument of the ``__builtin_amdgcn_fence()`` builtin is one of the
+memory-ordering specifiers ``__ATOMIC_ACQUIRE``, ``__ATOMIC_RELEASE``,
+``__ATOMIC_ACQ_REL``, or ``__ATOMIC_SEQ_CST``, following C++11 memory model
+semantics. The equivalent integer values of these memory orderings can also be
+specified. The builtin maps each C++ memory ordering to the corresponding
+LLVM atomic memory ordering for the fence instruction using the LLVM atomic C
+ABI, as given in the table below. The second argument is an AMDGCN-specific
+synchronization scope passed as a string. It can be any of the sync scopes
+defined for `AMDHSA LLVM Sync Scopes <https://llvm.org/docs/AMDGPUUsage.html#memory-scopes>`_.
+The builtin passes the second argument to the fence instruction unchanged
+and relies on the AMDGCN implementation to check its validity.
+
++------------------------------+--------------------------------+
+| Input in clang               | Output in IR                   |
+| (C++11 Memory-ordering)      | (LLVM Atomic Memory-ordering)  |
++======================+=======+========================+=======+
+| Enum                 | Value | Enum                   | Value |
++----------------------+-------+------------------------+-------+
+| ``__ATOMIC_ACQUIRE`` | 2     | Acquire                | 4     |
++----------------------+-------+------------------------+-------+
+| ``__ATOMIC_RELEASE`` | 3     | Release                | 5     |
++----------------------+-------+------------------------+-------+
+| ``__ATOMIC_ACQ_REL`` | 4     | AcquireRelease         | 6     |
++----------------------+-------+------------------------+-------+
+| ``__ATOMIC_SEQ_CST`` | 5     | SequentiallyConsistent | 7     |
++----------------------+-------+------------------------+-------+
+
+
 Low-level ARM exclusive memory builtins
 ---------------------------------------
 
diff --git a/clang/include/clang/Basic/Builtins.def b/clang/include/clang/Basic/Builtins.def
--- a/clang/include/clang/Basic/Builtins.def
+++ b/clang/include/clang/Basic/Builtins.def
@@ -785,11 +785,6 @@
 BUILTIN(__sync_fetch_and_umin, "UiUiD*Ui", "n")
 BUILTIN(__sync_fetch_and_umax, "UiUiD*Ui", "n")
 
-// clang builtin to expose llvm fence instruction
-// First argument : uint in range [2, 5] i.e. [acquire, seq_cst]
-// Second argument : target specific sync scope string
-BUILTIN(__builtin_memory_fence, "vUicC*", "n")
-
 // Random libc builtins.
 BUILTIN(__builtin_abort, "v", "Fnr")
 BUILTIN(__builtin_index, "c*cC*i", "Fn")
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -53,6 +53,7 @@
 BUILTIN(__builtin_amdgcn_ds_gws_sema_v, "vUi", "n")
 BUILTIN(__builtin_amdgcn_ds_gws_sema_br, "vUiUi", "n")
 BUILTIN(__builtin_amdgcn_ds_gws_sema_p, "vUi", "n")
+BUILTIN(__builtin_amdgcn_fence, "vUicC*", "n")
 
 // FIXME: Need to disallow constant address space.
 BUILTIN(__builtin_amdgcn_div_scale, "dddbb*", "n")
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -11896,6 +11896,7 @@
   bool CheckX86BuiltinGatherScatterScale(unsigned BuiltinID, CallExpr *TheCall);
   bool CheckX86BuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall);
   bool CheckPPCBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall);
+  bool CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall);
 
   bool SemaBuiltinVAStart(unsigned BuiltinID, CallExpr *TheCall);
   bool SemaBuiltinVAStartARMMicrosoft(CallExpr *Call);
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -13618,7 +13618,7 @@
     return Builder.CreateCall(F, { Src0, Src1, Src2 });
   }
 
-  case Builtin::BI__builtin_memory_fence: {
+  case AMDGPU::BI__builtin_amdgcn_fence: {
     llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent;
     llvm::SyncScope::ID SSID;
     Value *Order = EmitScalarExpr(E->getArg(0));
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -1870,34 +1870,6 @@
                   : "__builtin_frame_address")
           << TheCall->getSourceRange();
   } break;
-
-  case Builtin::BI__builtin_memory_fence: {
-    ExprResult Arg = TheCall->getArg(0);
-    auto ArgExpr = Arg.get();
-    Expr::EvalResult ArgResult;
-
-    if(!ArgExpr->EvaluateAsInt(ArgResult, Context)) {
-      Diag(ArgExpr->getExprLoc(), diag::err_typecheck_expect_int)
-        << ArgExpr->getType();
-      return ExprError();
-    }
-    int ord = ArgResult.Val.getInt().getZExtValue();
-
-    // Check valididty of memory ordering as per C11 / C++11's memody model.
-    switch (static_cast<llvm::AtomicOrderingCABI>(ord)) {
-      case llvm::AtomicOrderingCABI::acquire:
-      case llvm::AtomicOrderingCABI::release:
-      case llvm::AtomicOrderingCABI::acq_rel:
-      case llvm::AtomicOrderingCABI::seq_cst:
-        break;
-      default: {
-        Diag(ArgExpr->getBeginLoc(),
-            diag::warn_atomic_op_has_invalid_memory_order)
-              << ArgExpr->getSourceRange();
-        return ExprError();
-      }
-    }
-    } break;
   }
 
   // Since the target specific builtins for each arch overlap, only check those
@@ -1948,6 +1920,10 @@
         if (CheckPPCBuiltinFunctionCall(BuiltinID, TheCall))
           return ExprError();
         break;
+      case llvm::Triple::amdgcn:
+        if (CheckAMDGCNBuiltinFunctionCall(BuiltinID, TheCall))
+          return ExprError();
+        break;
       default:
         break;
     }
@@ -2949,6 +2925,37 @@
   return SemaBuiltinConstantArgRange(TheCall, i, l, u);
 }
 
+bool Sema::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
+  switch (BuiltinID) {
+    case AMDGPU::BI__builtin_amdgcn_fence: {
+      ExprResult Arg = TheCall->getArg(0); // First argument: memory ordering.
+      auto ArgExpr = Arg.get();
+      Expr::EvalResult ArgResult;
+
+      if(!ArgExpr->EvaluateAsInt(ArgResult, Context)) {
+        return Diag(ArgExpr->getExprLoc(), diag::err_typecheck_expect_int)
+          << ArgExpr->getType();
+      }
+      int ord = ArgResult.Val.getInt().getZExtValue();
+
+      // Check validity of memory ordering as per C11 / C++11's memory model.
+      switch (static_cast<llvm::AtomicOrderingCABI>(ord)) {
+        case llvm::AtomicOrderingCABI::acquire:
+        case llvm::AtomicOrderingCABI::release:
+        case llvm::AtomicOrderingCABI::acq_rel:
+        case llvm::AtomicOrderingCABI::seq_cst:
+          break;
+        default: {
+          return Diag(ArgExpr->getBeginLoc(),
+            diag::warn_atomic_op_has_invalid_memory_order)
+              << ArgExpr->getSourceRange();
+        }
+      }
+    } break; 
+  }
+  return false; // No diagnostic was issued for this builtin call.
+}
+
 bool Sema::CheckSystemZBuiltinFunctionCall(unsigned BuiltinID,
                                            CallExpr *TheCall) {
   if (BuiltinID == SystemZ::BI__builtin_tabort) {
diff --git a/clang/test/CodeGenCXX/builtin-amdgcn-fence-failure.cpp b/clang/test/CodeGenCXX/builtin-amdgcn-fence-failure.cpp
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGenCXX/builtin-amdgcn-fence-failure.cpp
@@ -0,0 +1,9 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: not %clang_cc1 %s -S \
+// RUN:   -triple=amdgcn-amd-amdhsa 2>&1 | FileCheck %s
+
+void test_amdgcn_fence_failure() {
+
+  // CHECK: error: Unsupported atomic synchronization scope 
+  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "foobar");
+}
\ No newline at end of file
diff --git a/clang/test/CodeGenHIP/builtin_memory_fence.cpp b/clang/test/CodeGenCXX/builtin-amdgcn-fence.cpp
rename from clang/test/CodeGenHIP/builtin_memory_fence.cpp
rename to clang/test/CodeGenCXX/builtin-amdgcn-fence.cpp
--- a/clang/test/CodeGenHIP/builtin_memory_fence.cpp
+++ b/clang/test/CodeGenCXX/builtin-amdgcn-fence.cpp
@@ -1,25 +1,22 @@
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 %s -x hip -emit-llvm -O0 -o - \
+// RUN: %clang_cc1 %s -emit-llvm -O0 -o - \
 // RUN:   -triple=amdgcn-amd-amdhsa  | opt -instnamer -S | FileCheck %s
 
 void test_memory_fence_success() {
 // CHECK-LABEL: test_memory_fence_success
 
   // CHECK: fence syncscope("workgroup") seq_cst
-  __builtin_memory_fence(__ATOMIC_SEQ_CST,  "workgroup");
+  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST,  "workgroup");
   
    // CHECK: fence syncscope("agent") acquire
-  __builtin_memory_fence(__ATOMIC_ACQUIRE, "agent");
+  __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "agent");
 
   // CHECK: fence seq_cst
-  __builtin_memory_fence(__ATOMIC_SEQ_CST, "");
+  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "");
 
   // CHECK: fence syncscope("agent") acq_rel
-  __builtin_memory_fence(4, "agent");
+  __builtin_amdgcn_fence(4, "agent");
 
     // CHECK: fence syncscope("workgroup") release
-  __builtin_memory_fence(3, "workgroup");
-
-  // CHECK: fence syncscope("foobar") release
-  __builtin_memory_fence(3, "foobar");
-}
\ No newline at end of file
+  __builtin_amdgcn_fence(3, "workgroup");
+}
diff --git a/clang/test/Sema/builtins.c b/clang/test/Sema/builtins.c
--- a/clang/test/Sema/builtins.c
+++ b/clang/test/Sema/builtins.c
@@ -320,15 +320,3 @@
   // expected-error@+1 {{use of unknown builtin '__builtin_is_constant_evaluated'}}
   return __builtin_is_constant_evaluated();
 }
-
-void test_memory_fence_errors() {
-  __builtin_memory_fence(__ATOMIC_SEQ_CST + 1, "workgroup"); // expected-warning {{memory order argument to atomic operation is invalid}}
-
-  __builtin_memory_fence(__ATOMIC_ACQUIRE - 1, "workgroup"); // expected-warning {{memory order argument to atomic operation is invalid}}
-
-  __builtin_memory_fence(4); // expected-error {{too few arguments to function call, expected 2}}
-
-  __builtin_memory_fence(4, 4, 4); // expected-error {{too many arguments to function call, expected 2}}
-
-  __builtin_memory_fence(3.14, ""); // expected-warning {{implicit conversion from 'double' to 'unsigned int' changes value from 3.14 to 3}}
-}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error.cl
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error.cl
@@ -128,3 +128,11 @@
   *out = __builtin_amdgcn_ds_fmaxf(out, src, 0, a, false); // expected-error {{argument to '__builtin_amdgcn_ds_fmaxf' must be a constant integer}}
   *out = __builtin_amdgcn_ds_fmaxf(out, src, 0, 0, a); // expected-error {{argument to '__builtin_amdgcn_ds_fmaxf' must be a constant integer}}
 }
+
+void test_fence() {
+  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST + 1, "workgroup"); // expected-warning {{memory order argument to atomic operation is invalid}}
+  __builtin_amdgcn_fence(__ATOMIC_ACQUIRE - 1, "workgroup"); // expected-warning {{memory order argument to atomic operation is invalid}}
+  __builtin_amdgcn_fence(4); // expected-error {{too few arguments to function call, expected 2}}
+  __builtin_amdgcn_fence(4, 4, 4); // expected-error {{too many arguments to function call, expected 2}}
+  __builtin_amdgcn_fence(3.14, ""); // expected-warning {{implicit conversion from 'double' to 'unsigned int' changes value from 3.14 to 3}}
+}