diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
@@ -88,88 +88,36 @@
 
     uint64_t SyncScopeID =
         cast<ConstantSDNode>(Node->getOperand(2).getNode())->getZExtValue();
+    MachineSDNode *Fence = nullptr;
     switch (SyncScopeID) {
-    case SyncScope::SingleThread: {
+    case SyncScope::SingleThread:
       // We lower a single-thread fence to a pseudo compiler barrier instruction
       // preventing instruction reordering. This will not be emitted in final
       // binary.
-      MachineSDNode *Fence =
-          CurDAG->getMachineNode(WebAssembly::COMPILER_FENCE,
-                                 DL,                 // debug loc
-                                 MVT::Other,         // outchain type
-                                 Node->getOperand(0) // inchain
-          );
-      ReplaceNode(Node, Fence);
-      CurDAG->RemoveDeadNode(Node);
-      return;
-    }
-
-    case SyncScope::System: {
-      // For non-emscripten systems, we have not decided on what we should
-      // traslate fences to yet.
-      if (!Subtarget->getTargetTriple().isOSEmscripten())
-        report_fatal_error(
-            "ATOMIC_FENCE is not yet supported in non-emscripten OSes");
-
-      // Wasm does not have a fence instruction, but because all atomic
-      // instructions in wasm are sequentially consistent, we translate a
-      // fence to an idempotent atomic RMW instruction to a linear memory
-      // address. All atomic instructions in wasm are sequentially consistent,
-      // but this is to ensure a fence also prevents reordering of non-atomic
-      // instructions in the VM. Even though LLVM IR's fence instruction does
-      // not say anything about its relationship with non-atomic instructions,
-      // we think this is more user-friendly.
-      //
-      // While any address can work, here we use a value stored in
-      // __stack_pointer wasm global because there's high chance that area is
-      // in cache.
-      //
-      // So the selected instructions will be in the form of:
-      //   %addr = get_global $__stack_pointer
-      //   %0 = i32.const 0
-      //   i32.atomic.rmw.or %addr, %0
-      SDValue StackPtrSym = CurDAG->getTargetExternalSymbol(
-          "__stack_pointer", TLI->getPointerTy(CurDAG->getDataLayout()));
-      MachineSDNode *GetGlobal =
-          CurDAG->getMachineNode(WebAssembly::GLOBAL_GET_I32, // opcode
-                                 DL,                          // debug loc
-                                 MVT::i32,                    // result type
-                                 StackPtrSym // __stack_pointer symbol
-          );
-
-      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
-      auto *MMO = MF.getMachineMemOperand(
-          MachinePointerInfo::getUnknownStack(MF),
-          // FIXME Volatile isn't really correct, but currently all LLVM
-          // atomic instructions are treated as volatiles in the backend, so
-          // we should be consistent.
-          MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad |
-              MachineMemOperand::MOStore,
-          4, 4, AAMDNodes(), nullptr, SyncScope::System,
-          AtomicOrdering::SequentiallyConsistent);
-      MachineSDNode *Const0 =
-          CurDAG->getMachineNode(WebAssembly::CONST_I32, DL, MVT::i32, Zero);
-      MachineSDNode *AtomicRMW = CurDAG->getMachineNode(
-          WebAssembly::ATOMIC_RMW_OR_I32, // opcode
-          DL,                             // debug loc
-          MVT::i32,                       // result type
-          MVT::Other,                     // outchain type
-          {
-              Zero,                  // alignment
-              Zero,                  // offset
-              SDValue(GetGlobal, 0), // __stack_pointer
-              SDValue(Const0, 0),    // OR with 0 to make it idempotent
-              Node->getOperand(0)    // inchain
-          });
-
-      CurDAG->setNodeMemRefs(AtomicRMW, {MMO});
-      ReplaceUses(SDValue(Node, 0), SDValue(AtomicRMW, 1));
-      CurDAG->RemoveDeadNode(Node);
-      return;
-    }
+      Fence = CurDAG->getMachineNode(WebAssembly::COMPILER_FENCE,
+                                     DL,                 // debug loc
+                                     MVT::Other,         // outchain type
+                                     Node->getOperand(0) // inchain
+      );
+      break;
+    case SyncScope::System:
+      // Currently wasm only supports sequentially consistent atomics, so we
+      // always set the order to 0 (sequentially consistent).
+      Fence = CurDAG->getMachineNode(
+          WebAssembly::ATOMIC_FENCE,
+          DL,                                         // debug loc
+          MVT::Other,                                 // outchain type
+          CurDAG->getTargetConstant(0, DL, MVT::i32), // order
+          Node->getOperand(0)                         // inchain
+      );
+      break;
     default:
       llvm_unreachable("Unknown scope!");
     }
+
+    ReplaceNode(Node, Fence);
+    CurDAG->RemoveDeadNode(Node);
+    return;
   }
 
   case ISD::GlobalTLSAddress: {
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
@@ -896,4 +896,7 @@
 let Defs = [ARGUMENTS] in {
 let isPseudo = 1, hasSideEffects = 1 in
 defm COMPILER_FENCE : ATOMIC_NRI<(outs), (ins), [], "compiler_fence">;
+let hasSideEffects = 1 in
+defm ATOMIC_FENCE : ATOMIC_NRI<(outs), (ins i8imm:$flags), [], "atomic.fence",
+                               0x03>;
 } // Defs = [ARGUMENTS]
diff --git a/llvm/test/CodeGen/WebAssembly/atomic-fence.ll b/llvm/test/CodeGen/WebAssembly/atomic-fence.ll
--- a/llvm/test/CodeGen/WebAssembly/atomic-fence.ll
+++ b/llvm/test/CodeGen/WebAssembly/atomic-fence.ll
@@ -1,19 +1,14 @@
 ; RUN: llc < %s | FileCheck %s --check-prefix NOATOMIC
-; RUN: not llc < %s -mtriple=wasm32-unknown-unknown -mattr=+atomics,+sign-ext 2>&1 | FileCheck %s --check-prefixes NOEMSCRIPTEN
-; RUN: not llc < %s -mtriple=wasm32-unknown-wasi -mattr=+atomics,+sign-ext 2>&1 | FileCheck %s --check-prefixes NOEMSCRIPTEN
-; RUN: llc < %s -mtriple=wasm32-unknown-emscripten -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+atomics,+sign-ext | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+atomics | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
 target triple = "wasm32-unknown-unknown"
 
 ; NOEMSCRIPTEN: LLVM ERROR: ATOMIC_FENCE is not yet supported in non-emscripten OSes
 
-; A multithread fence turns into 'global.get $__stack_pointer' followed by an
-; idempotent atomicrmw instruction.
+; A multithread fence is lowered to an atomic.fence instruction.
 ; CHECK-LABEL: multithread_fence:
-; CHECK:      global.get  $push[[SP:[0-9]+]]=, __stack_pointer
-; CHECK-NEXT: i32.const $push[[ZERO:[0-9]+]]=, 0
-; CHECK-NEXT: i32.atomic.rmw.or  $drop=, 0($pop[[SP]]), $pop[[ZERO]]
+; CHECK:  atomic.fence
-; NOATOMIC-NOT: i32.atomic.rmw.or
+; NOATOMIC-NOT: atomic.fence
 define void @multithread_fence() {
   fence seq_cst
@@ -23,10 +18,9 @@
 ; Fences with weaker memory orderings than seq_cst should be treated the same
 ; because atomic memory access in wasm are sequentially consistent.
 ; CHECK-LABEL: multithread_weak_fence:
-; CHECK:  global.get  $push{{.+}}=, __stack_pointer
-; CHECK:  i32.atomic.rmw.or
-; CHECK:  i32.atomic.rmw.or
-; CHECK:  i32.atomic.rmw.or
+; CHECK:       atomic.fence
+; CHECK-NEXT:  atomic.fence
+; CHECK-NEXT:  atomic.fence
 define void @multithread_weak_fence() {
   fence acquire
   fence release
diff --git a/llvm/test/MC/WebAssembly/atomics-encodings.s b/llvm/test/MC/WebAssembly/atomics-encodings.s
--- a/llvm/test/MC/WebAssembly/atomics-encodings.s
+++ b/llvm/test/MC/WebAssembly/atomics-encodings.s
@@ -145,4 +145,7 @@
   # CHECK:  i64.atomic.rmw32.cmpxchg_u 0 # encoding: [0xfe,0x4e,0x02,0x00]
   i64.atomic.rmw32.cmpxchg_u 0
 
+  # CHECK: atomic.fence # encoding: [0xfe,0x03,0x00]
+  atomic.fence
+
   end_function