diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
@@ -88,88 +88,36 @@
     uint64_t SyncScopeID =
         cast<ConstantSDNode>(Node->getOperand(2).getNode())->getZExtValue();
+    MachineSDNode *Fence = nullptr;
     switch (SyncScopeID) {
-    case SyncScope::SingleThread: {
+    case SyncScope::SingleThread:
       // We lower a single-thread fence to a pseudo compiler barrier instruction
       // preventing instruction reordering. This will not be emitted in final
       // binary.
-      MachineSDNode *Fence =
-          CurDAG->getMachineNode(WebAssembly::COMPILER_FENCE,
-                                 DL,                 // debug loc
-                                 MVT::Other,         // outchain type
-                                 Node->getOperand(0) // inchain
-          );
-      ReplaceNode(Node, Fence);
-      CurDAG->RemoveDeadNode(Node);
-      return;
-    }
-
-    case SyncScope::System: {
-      // For non-emscripten systems, we have not decided on what we should
-      // traslate fences to yet.
-      if (!Subtarget->getTargetTriple().isOSEmscripten())
-        report_fatal_error(
-            "ATOMIC_FENCE is not yet supported in non-emscripten OSes");
-
-      // Wasm does not have a fence instruction, but because all atomic
-      // instructions in wasm are sequentially consistent, we translate a
-      // fence to an idempotent atomic RMW instruction to a linear memory
-      // address. All atomic instructions in wasm are sequentially consistent,
-      // but this is to ensure a fence also prevents reordering of non-atomic
-      // instructions in the VM. Even though LLVM IR's fence instruction does
-      // not say anything about its relationship with non-atomic instructions,
-      // we think this is more user-friendly.
-      //
-      // While any address can work, here we use a value stored in
-      // __stack_pointer wasm global because there's high chance that area is
-      // in cache.
-      //
-      // So the selected instructions will be in the form of:
-      //   %addr = get_global $__stack_pointer
-      //   %0 = i32.const 0
-      //   i32.atomic.rmw.or %addr, %0
-      SDValue StackPtrSym = CurDAG->getTargetExternalSymbol(
-          "__stack_pointer", TLI->getPointerTy(CurDAG->getDataLayout()));
-      MachineSDNode *GetGlobal =
-          CurDAG->getMachineNode(WebAssembly::GLOBAL_GET_I32, // opcode
-                                 DL,                          // debug loc
-                                 MVT::i32,                    // result type
-                                 StackPtrSym // __stack_pointer symbol
-          );
-
-      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
-      auto *MMO = MF.getMachineMemOperand(
-          MachinePointerInfo::getUnknownStack(MF),
-          // FIXME Volatile isn't really correct, but currently all LLVM
-          // atomic instructions are treated as volatiles in the backend, so
-          // we should be consistent.
-          MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad |
-              MachineMemOperand::MOStore,
-          4, 4, AAMDNodes(), nullptr, SyncScope::System,
-          AtomicOrdering::SequentiallyConsistent);
-      MachineSDNode *Const0 =
-          CurDAG->getMachineNode(WebAssembly::CONST_I32, DL, MVT::i32, Zero);
-      MachineSDNode *AtomicRMW = CurDAG->getMachineNode(
-          WebAssembly::ATOMIC_RMW_OR_I32, // opcode
-          DL,                             // debug loc
-          MVT::i32,                       // result type
-          MVT::Other,                     // outchain type
-          {
-              Zero,                  // alignment
-              Zero,                  // offset
-              SDValue(GetGlobal, 0), // __stack_pointer
-              SDValue(Const0, 0),    // OR with 0 to make it idempotent
-              Node->getOperand(0)    // inchain
-          });
-
-      CurDAG->setNodeMemRefs(AtomicRMW, {MMO});
-      ReplaceUses(SDValue(Node, 0), SDValue(AtomicRMW, 1));
-      CurDAG->RemoveDeadNode(Node);
-      return;
-    }
+      Fence = CurDAG->getMachineNode(WebAssembly::COMPILER_FENCE,
+                                     DL,                 // debug loc
+                                     MVT::Other,         // outchain type
+                                     Node->getOperand(0) // inchain
+      );
+      break;
+    case SyncScope::System:
+      // Currently wasm only supports sequentially consistent atomics, so we
+      // always set the order to 0 (sequentially consistent).
+      Fence = CurDAG->getMachineNode(
+          WebAssembly::ATOMIC_FENCE,
+          DL,                                         // debug loc
+          MVT::Other,                                 // outchain type
+          CurDAG->getTargetConstant(0, DL, MVT::i32), // order
+          Node->getOperand(0)                         // inchain
+      );
+      break;
     default:
       llvm_unreachable("Unknown scope!");
     }
+
+    ReplaceNode(Node, Fence);
+    CurDAG->RemoveDeadNode(Node);
+    return;
   }
 
   case ISD::GlobalTLSAddress: {
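Note (not part of the patch): a minimal IR sketch of the two paths the rewritten switch takes. The function names here are illustrative only; the expected output follows the CodeGen tests added later in this patch.

  ; A system-scope (multithread) fence takes the SyncScope::System path and is
  ; selected to ATOMIC_FENCE with order immediate 0 (sequentially consistent),
  ; which prints as 'atomic.fence'.
  define void @system_fence_sketch() {
    fence seq_cst                               ; -> atomic.fence
    ret void
  }

  ; A singlethread fence takes the SyncScope::SingleThread path and is selected
  ; to COMPILER_FENCE, a pseudo barrier that blocks reordering in the backend
  ; but emits nothing in the final binary.
  define void @singlethread_fence_sketch() {
    fence syncscope("singlethread") seq_cst     ; -> no instruction emitted
    ret void
  }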
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
@@ -126,6 +126,19 @@ def : WaitPatGlobalAddrOffOnly;
 } // Predicates = [HasAtomics]
 
+//===----------------------------------------------------------------------===//
+// Atomic fences
+//===----------------------------------------------------------------------===//
+
+// A compiler fence instruction that prevents reordering of instructions.
+let Defs = [ARGUMENTS] in {
+let isPseudo = 1, hasSideEffects = 1 in
+defm COMPILER_FENCE : ATOMIC_NRI<(outs), (ins), [], "compiler_fence">;
+let hasSideEffects = 1 in
+defm ATOMIC_FENCE : ATOMIC_NRI<(outs), (ins i8imm:$flags), [], "atomic.fence",
+                               0x03>;
+} // Defs = [ARGUMENTS]
+
 //===----------------------------------------------------------------------===//
 // Atomic loads
 //===----------------------------------------------------------------------===//
@@ -887,13 +900,3 @@
   ATOMIC_RMW8_U_CMPXCHG_I32, ATOMIC_RMW16_U_CMPXCHG_I32,
   ATOMIC_RMW8_U_CMPXCHG_I64, ATOMIC_RMW16_U_CMPXCHG_I64,
   ATOMIC_RMW32_U_CMPXCHG_I64>;
-
-//===----------------------------------------------------------------------===//
-// Atomic fences
-//===----------------------------------------------------------------------===//
-
-// A compiler fence instruction that prevents reordering of instructions.
-let Defs = [ARGUMENTS] in {
-let isPseudo = 1, hasSideEffects = 1 in
-defm COMPILER_FENCE : ATOMIC_NRI<(outs), (ins), [], "compiler_fence">;
-} // Defs = [ARGUMENTS]
diff --git a/llvm/test/CodeGen/WebAssembly/atomic-fence.ll b/llvm/test/CodeGen/WebAssembly/atomic-fence.ll
--- a/llvm/test/CodeGen/WebAssembly/atomic-fence.ll
+++ b/llvm/test/CodeGen/WebAssembly/atomic-fence.ll
@@ -1,19 +1,12 @@
 ; RUN: llc < %s | FileCheck %s --check-prefix NOATOMIC
-; RUN: not llc < %s -mtriple=wasm32-unknown-unknown -mattr=+atomics,+sign-ext 2>&1 | FileCheck %s --check-prefixes NOEMSCRIPTEN
-; RUN: not llc < %s -mtriple=wasm32-unknown-wasi -mattr=+atomics,+sign-ext 2>&1 | FileCheck %s --check-prefixes NOEMSCRIPTEN
-; RUN: llc < %s -mtriple=wasm32-unknown-emscripten -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+atomics,+sign-ext | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+atomics | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
 target triple = "wasm32-unknown-unknown"
 
-; NOEMSCRIPTEN: LLVM ERROR: ATOMIC_FENCE is not yet supported in non-emscripten OSes
-
-; A multithread fence turns into 'global.get $__stack_pointer' followed by an
-; idempotent atomicrmw instruction.
+; A multithread fence is lowered to an atomic.fence instruction.
 ; CHECK-LABEL: multithread_fence:
-; CHECK:      global.get $push[[SP:[0-9]+]]=, __stack_pointer
-; CHECK-NEXT: i32.const $push[[ZERO:[0-9]+]]=, 0
-; CHECK-NEXT: i32.atomic.rmw.or $drop=, 0($pop[[SP]]), $pop[[ZERO]]
+; CHECK: atomic.fence
 ; NOATOMIC-NOT: i32.atomic.rmw.or
 define void @multithread_fence() {
   fence seq_cst
@@ -23,10 +16,9 @@
 ; Fences with weaker memory orderings than seq_cst should be treated the same
 ; because atomic memory access in wasm are sequentially consistent.
 ; CHECK-LABEL: multithread_weak_fence:
-; CHECK: global.get $push{{.+}}=, __stack_pointer
-; CHECK: i32.atomic.rmw.or
-; CHECK: i32.atomic.rmw.or
-; CHECK: i32.atomic.rmw.or
+; CHECK:      atomic.fence
+; CHECK-NEXT: atomic.fence
+; CHECK-NEXT: atomic.fence
 define void @multithread_weak_fence() {
   fence acquire
   fence release
@@ -37,7 +29,8 @@
 ; A singlethread fence becomes compiler_fence instruction, a pseudo instruction
 ; that acts as a compiler barrier. The barrier should not be emitted to .s file.
 ; CHECK-LABEL: singlethread_fence:
-; CHECK-NOT: compiler_fence
+; CHECK-NOT: compiler_fence
+; CHECK-NOT: atomic_fence
 define void @singlethread_fence() {
   fence syncscope("singlethread") seq_cst
   fence syncscope("singlethread") acquire
diff --git a/llvm/test/CodeGen/WebAssembly/atomic-fence.mir b/llvm/test/CodeGen/WebAssembly/atomic-fence.mir
new file
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/atomic-fence.mir
@@ -0,0 +1,68 @@
+# RUN: llc -mtriple=wasm32-unknown-unknown -run-pass wasm-reg-stackify -run-pass wasm-explicit-locals %s -o - | FileCheck %s
+
+# In the two tests below, without a compiler_fence or atomic.fence in between,
+# atomic.notify and i32.add would be reordered by the register stackify pass to
+# satisfy the operand order of 'call @foo'. Because there is a fence between
+# atomic.notify and i32.add, they cannot be reordered, and local.set and
+# local.get are inserted to save and reload atomic.notify's return value.
+
+--- |
+  target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+  target triple = "wasm32-unknown-unknown"
+
+  declare void @foo(i32, i32)
+  define void @compiler_fence_test(i32) {
+    ret void
+  }
+  define void @atomic_fence_test(i32) {
+    ret void
+  }
+...
+---
+# CHECK-LABEL: name: compiler_fence_test
+name: compiler_fence_test
+liveins:
+  - { reg: '$arguments' }
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK: %[[REG:[0-9]+]]:i32 = ATOMIC_NOTIFY
+    ; CHECK: LOCAL_SET_I32 [[LOCAL:[0-9]+]], %[[REG]]
+    ; CHECK: COMPILER_FENCE
+    ; CHECK: ADD_I32
+    ; CHECK: LOCAL_GET_I32 [[LOCAL]]
+    ; CHECK: CALL_VOID @foo
+
+    liveins: $arguments
+    %0:i32 = CONST_I32 0, implicit-def $arguments
+    %1:i32 = ATOMIC_NOTIFY 2, 0, %0:i32, %0:i32, implicit-def $arguments
+    COMPILER_FENCE implicit-def $arguments
+    %2:i32 = ADD_I32 %0:i32, %0:i32, implicit-def $arguments
+    CALL_VOID @foo, %2:i32, %1:i32, implicit-def $arguments
+    RETURN_VOID implicit-def $arguments
+...
+
+---
+# CHECK-LABEL: name: atomic_fence_test
+name: atomic_fence_test
+liveins:
+  - { reg: '$arguments' }
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK: %[[REG:[0-9]+]]:i32 = ATOMIC_NOTIFY
+    ; CHECK: LOCAL_SET_I32 [[LOCAL:[0-9]+]], %[[REG]]
+    ; CHECK: ATOMIC_FENCE
+    ; CHECK: ADD_I32
+    ; CHECK: LOCAL_GET_I32 [[LOCAL]]
+    ; CHECK: CALL_VOID @foo
+
+    liveins: $arguments
+    %0:i32 = CONST_I32 0, implicit-def $arguments
+    %1:i32 = ATOMIC_NOTIFY 2, 0, %0:i32, %0:i32, implicit-def $arguments
+    ATOMIC_FENCE 0, implicit-def $arguments
+    %2:i32 = ADD_I32 %0:i32, %0:i32, implicit-def $arguments
+    CALL_VOID @foo, %2:i32, %1:i32, implicit-def $arguments
+    RETURN_VOID implicit-def $arguments
+...
+
diff --git a/llvm/test/MC/WebAssembly/atomics-encodings.s b/llvm/test/MC/WebAssembly/atomics-encodings.s
--- a/llvm/test/MC/WebAssembly/atomics-encodings.s
+++ b/llvm/test/MC/WebAssembly/atomics-encodings.s
@@ -10,6 +10,9 @@
   # CHECK: i64.atomic.wait 0 # encoding: [0xfe,0x02,0x03,0x00]
   i64.atomic.wait 0
 
+  # CHECK: atomic.fence # encoding: [0xfe,0x03,0x00]
+  atomic.fence
+
   # CHECK: i32.atomic.load 0 # encoding: [0xfe,0x10,0x02,0x00]
   i32.atomic.load 0
   # CHECK: i64.atomic.load 4 # encoding: [0xfe,0x11,0x03,0x04]
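As a quick end-to-end check, here is a standalone sketch in the same style as the updated atomic-fence.ll test. It is not part of the patch; the RUN line mirrors the one added above and assumes an llc built with the WebAssembly target.

  ; RUN: llc < %s -mtriple=wasm32-unknown-unknown -asm-verbose=false \
  ; RUN:   -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+atomics \
  ; RUN:   | FileCheck %s

  ; Weaker thread-visible orderings collapse to the same instruction, since
  ; wasm atomics are sequentially consistent (see multithread_weak_fence above).
  ; CHECK-LABEL: fence_orderings:
  ; CHECK:      atomic.fence
  ; CHECK-NEXT: atomic.fence
  define void @fence_orderings() {
    fence acquire
    fence seq_cst
    ret void
  }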