Index: llvm/trunk/include/llvm/Target/TargetLowering.h
===================================================================
--- llvm/trunk/include/llvm/Target/TargetLowering.h
+++ llvm/trunk/include/llvm/Target/TargetLowering.h
@@ -964,29 +964,54 @@
   /// It is called by AtomicExpandPass before expanding an
   ///   AtomicRMW/AtomicCmpXchg/AtomicStore/AtomicLoad.
   /// RMW and CmpXchg set both IsStore and IsLoad to true.
-  /// Backends with !getInsertFencesForAtomic() should keep a no-op here.
   /// This function should either return a nullptr, or a pointer to an IR-level
   ///   Instruction*. Even complex fence sequences can be represented by a
   ///   single Instruction* through an intrinsic to be lowered later.
+  /// Backends with !getInsertFencesForAtomic() should keep a no-op here.
+  /// Backends should override this method to produce a target-specific
+  ///   intrinsic for their fences.
+  /// FIXME: Please note that the default implementation here in terms of
+  ///   IR-level fences exists for historical/compatibility reasons and is
+  ///   *unsound*! Fences cannot, in general, be used to restore sequential
+  ///   consistency. For example, consider the following scenario:
+  /// atomic<int> x = y = 0;
+  /// int r1, r2, r3, r4;
+  /// Thread 0:
+  ///   x.store(1);
+  /// Thread 1:
+  ///   y.store(1);
+  /// Thread 2:
+  ///   r1 = x.load();
+  ///   r2 = y.load();
+  /// Thread 3:
+  ///   r3 = y.load();
+  ///   r4 = x.load();
+  /// r1 = r3 = 1 and r2 = r4 = 0 is impossible as long as the accesses are all
+  ///   seq_cst. But if they are lowered to monotonic accesses, no amount of
+  ///   IR-level fences can prevent it.
+  /// @{
   virtual Instruction* emitLeadingFence(IRBuilder<> &Builder,
                                         AtomicOrdering Ord, bool IsStore,
                                         bool IsLoad) const {
-    assert(!getInsertFencesForAtomic());
-    return nullptr;
+    if (!getInsertFencesForAtomic())
+      return nullptr;
+
+    if (isAtLeastRelease(Ord) && IsStore)
+      return Builder.CreateFence(Ord);
+    else
+      return nullptr;
   }
-  /// Inserts in the IR a target-specific intrinsic specifying a fence.
-  /// It is called by AtomicExpandPass after expanding an
-  ///   AtomicRMW/AtomicCmpXchg/AtomicStore/AtomicLoad.
-  /// RMW and CmpXchg set both IsStore and IsLoad to true.
-  /// Backends with !getInsertFencesForAtomic() should keep a no-op here.
-  /// This function should either return a nullptr, or a pointer to an IR-level
-  ///   Instruction*. Even complex fence sequences can be represented by a
-  ///   single Instruction* through an intrinsic to be lowered later.
   virtual Instruction* emitTrailingFence(IRBuilder<> &Builder,
                                          AtomicOrdering Ord, bool IsStore,
                                          bool IsLoad) const {
-    assert(!getInsertFencesForAtomic());
-    return nullptr;
+    if (!getInsertFencesForAtomic())
+      return nullptr;
+
+    if (isAtLeastAcquire(Ord))
+      return Builder.CreateFence(Ord);
+    else
+      return nullptr;
   }
+  /// @}
   /// Returns true if the given (atomic) store should be expanded by the
   /// IR-level AtomicExpand pass into an "atomic xchg" which ignores its input.
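The FIXME added above describes the classic "independent reads of independent writes" (IRIW) litmus test. For readers who want to make the comment's scenario concrete, here is a minimal self-contained C++11 sketch of the same four threads; it is not part of the patch, the variable names and thread layout follow the comment, and it relies on std::atomic's default seq_cst ordering, which is what forbids the outcome r1 == r3 == 1 && r2 == r4 == 0.

#include <atomic>
#include <cstdio>
#include <thread>

// IRIW litmus test from the FIXME above. With the default seq_cst accesses,
// the outcome r1 == r3 == 1 && r2 == r4 == 0 is forbidden. If the accesses
// were demoted to relaxed (LLVM's "monotonic"), no placement of fences
// between them could rule that outcome out, which is why the default
// implementation of emitLeadingFence/emitTrailingFence is marked unsound.
std::atomic<int> x(0), y(0);
int r1, r2, r3, r4;

int main() {
  std::thread t0([] { x.store(1); });                   // Thread 0
  std::thread t1([] { y.store(1); });                   // Thread 1
  std::thread t2([] { r1 = x.load(); r2 = y.load(); }); // Thread 2
  std::thread t3([] { r3 = y.load(); r4 = x.load(); }); // Thread 3
  t0.join(); t1.join(); t2.join(); t3.join();
  std::printf("r1=%d r2=%d r3=%d r4=%d\n", r1, r2, r3, r4);
  return 0;
}

A single run obviously cannot demonstrate the guarantee; the snippet only makes the comment's point tangible: per-access fences are not a substitute for seq_cst accesses.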
Index: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -3604,30 +3604,6 @@
   DAG.setRoot(StoreNode);
 }

-static SDValue InsertFenceForAtomic(SDValue Chain, AtomicOrdering Order,
-                                    SynchronizationScope Scope,
-                                    bool Before, SDLoc dl,
-                                    SelectionDAG &DAG,
-                                    const TargetLowering &TLI) {
-  // Fence, if necessary
-  if (Before) {
-    if (Order == AcquireRelease || Order == SequentiallyConsistent)
-      Order = Release;
-    else if (Order == Acquire || Order == Monotonic || Order == Unordered)
-      return Chain;
-  } else {
-    if (Order == AcquireRelease)
-      Order = Acquire;
-    else if (Order == Release || Order == Monotonic || Order == Unordered)
-      return Chain;
-  }
-  SDValue Ops[3];
-  Ops[0] = Chain;
-  Ops[1] = DAG.getConstant(Order, TLI.getPointerTy());
-  Ops[2] = DAG.getConstant(Scope, TLI.getPointerTy());
-  return DAG.getNode(ISD::ATOMIC_FENCE, dl, MVT::Other, Ops);
-}
-
 void SelectionDAGBuilder::visitAtomicCmpXchg(const AtomicCmpXchgInst &I) {
   SDLoc dl = getCurSDLoc();
   AtomicOrdering SuccessOrder = I.getSuccessOrdering();
@@ -3636,27 +3612,16 @@

   SDValue InChain = getRoot();

-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  if (TLI.getInsertFencesForAtomic())
-    InChain =
-        InsertFenceForAtomic(InChain, SuccessOrder, Scope, true, dl, DAG, TLI);
-
   MVT MemVT = getValue(I.getCompareOperand()).getSimpleValueType();
   SDVTList VTs = DAG.getVTList(MemVT, MVT::i1, MVT::Other);
   SDValue L = DAG.getAtomicCmpSwap(
       ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl, MemVT, VTs, InChain,
       getValue(I.getPointerOperand()), getValue(I.getCompareOperand()),
       getValue(I.getNewValOperand()), MachinePointerInfo(I.getPointerOperand()),
-      0 /* Alignment */,
-      TLI.getInsertFencesForAtomic() ? Monotonic : SuccessOrder,
-      TLI.getInsertFencesForAtomic() ? Monotonic : FailureOrder, Scope);
+      /*Alignment=*/ 0, SuccessOrder, FailureOrder, Scope);

   SDValue OutChain = L.getValue(2);

-  if (TLI.getInsertFencesForAtomic())
-    OutChain = InsertFenceForAtomic(OutChain, SuccessOrder, Scope, false, dl,
-                                    DAG, TLI);
-
   setValue(&I, L);
   DAG.setRoot(OutChain);
 }
@@ -3683,22 +3648,17 @@

   SDValue InChain = getRoot();

-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  if (TLI.getInsertFencesForAtomic())
-    InChain = InsertFenceForAtomic(InChain, Order, Scope, true, dl, DAG, TLI);
-
-  SDValue L = DAG.getAtomic(
-      NT, dl, getValue(I.getValOperand()).getSimpleValueType(), InChain,
-      getValue(I.getPointerOperand()), getValue(I.getValOperand()),
-      I.getPointerOperand(), 0 /* Alignment */,
-      TLI.getInsertFencesForAtomic() ? Monotonic : Order, Scope);
+  SDValue L =
+    DAG.getAtomic(NT, dl,
+                  getValue(I.getValOperand()).getSimpleValueType(),
+                  InChain,
+                  getValue(I.getPointerOperand()),
+                  getValue(I.getValOperand()),
+                  I.getPointerOperand(),
+                  /* Alignment=*/ 0, Order, Scope);

   SDValue OutChain = L.getValue(1);

-  if (TLI.getInsertFencesForAtomic())
-    OutChain =
-        InsertFenceForAtomic(OutChain, Order, Scope, false, dl, DAG, TLI);
-
   setValue(&I, L);
   DAG.setRoot(OutChain);
 }
@@ -3736,16 +3696,13 @@
                             DAG.getEVTAlignment(VT));

   InChain = TLI.prepareVolatileOrAtomicLoad(InChain, dl, DAG);
-  SDValue L = DAG.getAtomic(
-      ISD::ATOMIC_LOAD, dl, VT, VT, InChain, getValue(I.getPointerOperand()),
-      MMO, TLI.getInsertFencesForAtomic() ? Monotonic : Order, Scope);
+  SDValue L =
+      DAG.getAtomic(ISD::ATOMIC_LOAD, dl, VT, VT, InChain,
+                    getValue(I.getPointerOperand()), MMO,
+                    Order, Scope);

   SDValue OutChain = L.getValue(1);

-  if (TLI.getInsertFencesForAtomic())
-    OutChain = InsertFenceForAtomic(OutChain, Order, Scope, false, dl,
-                                    DAG, TLI);
-
   setValue(&I, L);
   DAG.setRoot(OutChain);
 }
@@ -3764,17 +3721,13 @@
   if (I.getAlignment() < VT.getSizeInBits() / 8)
     report_fatal_error("Cannot generate unaligned atomic store");

-  if (TLI.getInsertFencesForAtomic())
-    InChain = InsertFenceForAtomic(InChain, Order, Scope, true, dl, DAG, TLI);
-
-  SDValue OutChain = DAG.getAtomic(
-      ISD::ATOMIC_STORE, dl, VT, InChain, getValue(I.getPointerOperand()),
-      getValue(I.getValueOperand()), I.getPointerOperand(), I.getAlignment(),
-      TLI.getInsertFencesForAtomic() ? Monotonic : Order, Scope);
-
-  if (TLI.getInsertFencesForAtomic())
-    OutChain =
-        InsertFenceForAtomic(OutChain, Order, Scope, false, dl, DAG, TLI);
+  SDValue OutChain =
+    DAG.getAtomic(ISD::ATOMIC_STORE, dl, VT,
+                  InChain,
+                  getValue(I.getPointerOperand()),
+                  getValue(I.getValueOperand()),
+                  I.getPointerOperand(), I.getAlignment(),
+                  Order, Scope);

   DAG.setRoot(OutChain);
 }
Index: llvm/trunk/lib/Target/Mips/MipsTargetMachine.cpp
===================================================================
--- llvm/trunk/lib/Target/Mips/MipsTargetMachine.cpp
+++ llvm/trunk/lib/Target/Mips/MipsTargetMachine.cpp
@@ -178,6 +178,7 @@

 void MipsPassConfig::addIRPasses() {
   TargetPassConfig::addIRPasses();
+  addPass(createAtomicExpandPass(&getMipsTargetMachine()));
   if (getMipsSubtarget().os16())
     addPass(createMipsOs16(getMipsTargetMachine()));
   if (getMipsSubtarget().inMips16HardFloat())
Index: llvm/trunk/lib/Target/Sparc/SparcTargetMachine.cpp
===================================================================
--- llvm/trunk/lib/Target/Sparc/SparcTargetMachine.cpp
+++ llvm/trunk/lib/Target/Sparc/SparcTargetMachine.cpp
@@ -47,6 +47,7 @@
     return getTM<SparcTargetMachine>();
   }

+  void addIRPasses() override;
   bool addInstSelector() override;
   bool addPreEmitPass() override;
 };
@@ -56,6 +57,12 @@
   return new SparcPassConfig(this, PM);
 }

+void SparcPassConfig::addIRPasses() {
+  addPass(createAtomicExpandPass(&getSparcTargetMachine()));
+
+  TargetPassConfig::addIRPasses();
+}
+
 bool SparcPassConfig::addInstSelector() {
   addPass(createSparcISelDag(getSparcTargetMachine()));
   return false;
Index: llvm/trunk/lib/Target/XCore/XCoreTargetMachine.cpp
===================================================================
--- llvm/trunk/lib/Target/XCore/XCoreTargetMachine.cpp
+++ llvm/trunk/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -41,6 +41,7 @@
     return getTM<XCoreTargetMachine>();
  }

+  void addIRPasses() override;
   bool addPreISel() override;
   bool addInstSelector() override;
   bool addPreEmitPass() override;
@@ -51,6 +52,12 @@
   return new XCorePassConfig(this, PM);
 }

+void XCorePassConfig::addIRPasses() {
+  addPass(createAtomicExpandPass(&getXCoreTargetMachine()));
+
+  TargetPassConfig::addIRPasses();
+}
+
 bool XCorePassConfig::addPreISel() {
   addPass(createXCoreLowerThreadLocalPass());
   return false;
Index: llvm/trunk/test/CodeGen/XCore/atomic.ll
===================================================================
--- llvm/trunk/test/CodeGen/XCore/atomic.ll
+++ llvm/trunk/test/CodeGen/XCore/atomic.ll
@@ -22,11 +22,10 @@
 ; CHECK-LABEL: atomicloadstore

 ; CHECK: ldw r[[R0:[0-9]+]], dp[pool]
-; CHECK-NEXT: #MEMBARRIER
- %0 = load atomic i32* bitcast (i64* @pool to i32*) acquire, align 4
-
 ; CHECK-NEXT: ldaw r[[R1:[0-9]+]], dp[pool]
+; CHECK-NEXT: #MEMBARRIER
 ; CHECK-NEXT: ldc r[[R2:[0-9]+]], 0
+ %0 = load atomic i32* bitcast (i64* @pool to i32*) acquire, align 4

 ; CHECK-NEXT: ld16s r3, r[[R1]][r[[R2]]]
 ; CHECK-NEXT: #MEMBARRIER
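Finally, a note on how backends are expected to use the emitLeadingFence/emitTrailingFence hooks documented in the TargetLowering.h hunk above: targets that return true from getInsertFencesForAtomic() should eventually override them to emit a target-specific intrinsic rather than rely on the (unsound) IR-fence default. The sketch below is purely illustrative and is not part of this patch; MyTargetLowering and Intrinsic::mytarget_membar are made-up names for a hypothetical backend, while Intrinsic::getDeclaration, IRBuilder<>::CreateCall, and isAtLeastRelease are the existing APIs the default implementation already uses.

// Illustrative only, not part of this patch. "MyTargetLowering" and
// "Intrinsic::mytarget_membar" are hypothetical names. The point is that a
// single call to a target intrinsic can stand in for an arbitrary fence
// sequence and be lowered later, as the updated documentation suggests.
Instruction *MyTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
                                                AtomicOrdering Ord,
                                                bool IsStore,
                                                bool IsLoad) const {
  if (!getInsertFencesForAtomic())
    return nullptr;
  // Assume this target only needs a barrier before release-or-stronger
  // stores; acquire loads would be handled by emitTrailingFence.
  if (!(IsStore && isAtLeastRelease(Ord)))
    return nullptr;
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *Barrier = Intrinsic::getDeclaration(M, Intrinsic::mytarget_membar);
  return Builder.CreateCall(Barrier);
}

The matching emitTrailingFence override would perform the symmetric isAtLeastAcquire(Ord) check for loads. Until a target provides such an override, it gets the default plain IR fences, which is what produces the #MEMBARRIER lines checked in the XCore test above.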