Index: lib/Target/X86/X86ISelDAGToDAG.cpp
===================================================================
--- lib/Target/X86/X86ISelDAGToDAG.cpp
+++ lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -22,6 +22,7 @@
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/IR/Instructions.h"
@@ -193,6 +194,7 @@
     SDNode *Select(SDNode *N) override;
     SDNode *SelectGather(SDNode *N, unsigned Opc);
     SDNode *SelectAtomicLoadArith(SDNode *Node, MVT NVT);
+    SDNode *SelectAtomicAddZero(SDNode *Node, MVT NVT);

     bool FoldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
     bool MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
@@ -1743,6 +1745,80 @@
   return Val;
 }

+// On x86, an atomic load-add of the constant 0 can be replaced by an mfence
+// followed by a mov. A detailed explanation (and an example of why the
+// mfence is required) is available at
+// http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf
+// The general idea is that only a store operation can have release
+// semantics, so a seqlock (whose reader side is implemented entirely with
+// loads) needs a release operation at the end of its critical section to
+// prevent operations from being sunk out of it. Replacing the last load
+// with a fetch_add(0, release) accomplishes just that, but it relies on
+// this optimization to preserve the desirable property of seqlocks that
+// readers do not cause cache-line bouncing.
+// The mfence is required because without it the plain load could be hoisted
+// above a preceding store (store-load is the only reordering the x86 memory
+// model allows), whereas the original fetch_add, being a locked
+// instruction, could not be reordered that way.
+SDNode *X86DAGToDAGISel::SelectAtomicAddZero(SDNode *Node, MVT NVT) {
+  assert(Node->getOpcode() == ISD::ATOMIC_LOAD_ADD);
+
+  SDLoc dl(Node);
+
+  SDValue Chain = Node->getOperand(0);
+  SDValue Ptr = Node->getOperand(1);
+  SDValue Val = Node->getOperand(2);
+  SDValue Base, Scale, Index, Disp, Segment;
+  if (!SelectAddr(Node, Ptr, Base, Scale, Index, Disp, Segment))
+    return nullptr;
+
+  auto *CN = dyn_cast<ConstantSDNode>(Val);
+  if (!CN)
+    return nullptr;
+
+  int64_t CNVal = CN->getSExtValue();
+  if (CNVal != 0)
+    return nullptr;
+
+  auto *FenceNode = CurDAG->getMachineNode(X86::MFENCE, dl, MVT::Other, Chain);
+
+  unsigned Opc;
+  switch (NVT.SimpleTy) {
+  case MVT::i8:
+    Opc = X86::ACQUIRE_MOV8rm;
+    break;
+  case MVT::i16:
+    Opc = X86::ACQUIRE_MOV16rm;
+    break;
+  case MVT::i32:
+    Opc = X86::ACQUIRE_MOV32rm;
+    break;
+  case MVT::i64:
+    Opc = X86::ACQUIRE_MOV64rm;
+    break;
+  default:
+    llvm_unreachable("Unexpected size for LXADD 0");
+  }
+
+  // Note that FenceNode is used for the 'chain' operand, guaranteeing that it
+  // will be scheduled before the load.
+  SDValue FenceChain = SDValue(FenceNode, 0);
+  SDValue Ops[] = {Base, Scale, Index, Disp, Segment, FenceChain};
+  auto *LoadNode = CurDAG->getMachineNode(Opc, dl, NVT, MVT::Other, Ops);
+
+  // We must copy the information about the memory operand, but change the
+  // flags to remove MOStore: the replacement is a pure load and must not be
+  // marked as mayStore.
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  auto *SMemOp = cast<MemSDNode>(Node)->getMemOperand();
+  auto Flags = SMemOp->getFlags() & ~MachineMemOperand::MOStore;
+  MemOp[0] = new MachineMemOperand(SMemOp->getPointerInfo(), Flags,
+                                   SMemOp->getSize(), SMemOp->getAlignment(),
+                                   SMemOp->getAAInfo(), SMemOp->getRanges());
+  cast<MachineSDNode>(LoadNode)->setMemRefs(MemOp, MemOp + 1);
+
+  return LoadNode;
+}
+
 SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) {
   if (Node->hasAnyUseOfValue(0))
     return nullptr;
@@ -2106,13 +2182,14 @@

   case X86ISD::GlobalBaseReg:
     return getGlobalBaseReg();
-
+  case ISD::ATOMIC_LOAD_ADD:
+    if (SDNode *RetVal = SelectAtomicAddZero(Node, NVT))
+      return RetVal;
+    /* FALLTHROUGH */
   case ISD::ATOMIC_LOAD_XOR:
   case ISD::ATOMIC_LOAD_AND:
-  case ISD::ATOMIC_LOAD_OR:
-  case ISD::ATOMIC_LOAD_ADD: {
-    SDNode *RetVal = SelectAtomicLoadArith(Node, NVT);
-    if (RetVal)
+  case ISD::ATOMIC_LOAD_OR: {
+    if (SDNode *RetVal = SelectAtomicLoadArith(Node, NVT))
       return RetVal;
     break;
   }
Index: test/CodeGen/X86/atomic_add_zero.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/atomic_add_zero.ll
@@ -0,0 +1,47 @@
+; RUN: llc < %s -march=x86-64 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=X64
+; RUN: llc < %s -march=x86 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=X32
+
+; On x86, an atomic rmw operation that does not modify the value in memory
+; (such as atomic add 0) can be replaced by an mfence followed by a mov.
+; This is explained (with the motivation for such an optimization) in
+; http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf
+
+define i8 @add8(i8* %p) {
+; CHECK-LABEL: add8
+; CHECK: mfence
+; CHECK: movb
+  %1 = atomicrmw add i8* %p, i8 0 monotonic
+  ret i8 %1
+}
+
+define i16 @add16(i16* %p) {
+; CHECK-LABEL: add16
+; CHECK: mfence
+; CHECK: movw
+  %1 = atomicrmw add i16* %p, i16 0 monotonic
+  ret i16 %1
+}
+
+define i32 @add32(i32* %p) {
+; CHECK-LABEL: add32
+; CHECK: mfence
+; CHECK: movl
+  %1 = atomicrmw add i32* %p, i32 0 monotonic
+  ret i32 %1
+}
+
+define i64 @add64(i64* %p) {
+; CHECK-LABEL: add64
+; X64: mfence
+; X64: movq
+; X32-NOT: mfence
+  %1 = atomicrmw add i64* %p, i64 0 monotonic
+  ret i64 %1
+}
+
+define i128 @add128(i128* %p) {
+; CHECK-LABEL: add128
+; CHECK-NOT: mfence
+  %1 = atomicrmw add i128* %p, i128 0 monotonic
+  ret i128 %1
+}
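
Additional context for reviewers (not part of the patch): the seqlock reader pattern from the HP report cited in the comments boils down to the sketch below. The struct and function names are invented for illustration, and the payload is modelled with relaxed atomics as the report suggests; the final fetch_add(0, release) on the sequence counter is the kind of add-of-zero atomicrmw that this change selects as "mfence; mov" instead of "lock xadd", so readers never write the counter's cache line.

#include <atomic>
#include <cstdint>
#include <utility>

// Illustrative seqlock-protected pair of counters (names invented for this
// sketch).
struct SeqLocked {
  std::atomic<uint64_t> Seq{0};       // even = stable, odd = writer active
  std::atomic<uint64_t> A{0}, B{0};   // payload, accessed with relaxed ops
};

// Reader: retry until an even, unchanged sequence number is observed.
// The final re-read of Seq uses fetch_add(0, release) so that the payload
// loads cannot be reordered after it; with this patch that add-of-zero is
// selected as "mfence; mov" rather than a locked xadd.
std::pair<uint64_t, uint64_t> readSeqLocked(SeqLocked &S) {
  for (;;) {
    uint64_t Seq1 = S.Seq.load(std::memory_order_acquire);
    if (Seq1 & 1)
      continue;                                  // writer in progress, retry
    uint64_t A = S.A.load(std::memory_order_relaxed);
    uint64_t B = S.B.load(std::memory_order_relaxed);
    uint64_t Seq2 = S.Seq.fetch_add(0, std::memory_order_release);
    if (Seq1 == Seq2)
      return {A, B};                             // consistent snapshot
  }
}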