Index: include/llvm/IR/Instruction.h =================================================================== --- include/llvm/IR/Instruction.h +++ include/llvm/IR/Instruction.h @@ -338,6 +338,11 @@ return mayReadFromMemory() || mayWriteToMemory(); } + /// isAtomic - Return true if this instruction has an + /// AtomicOrdering of unordered or higher. + /// + bool isAtomic() const; + /// mayThrow - Return true if this instruction may throw an exception. /// bool mayThrow() const; Index: include/llvm/IR/Instructions.h =================================================================== --- include/llvm/IR/Instructions.h +++ include/llvm/IR/Instructions.h @@ -241,7 +241,6 @@ (xthread << 6)); } - bool isAtomic() const { return getOrdering() != NotAtomic; } void setAtomic(AtomicOrdering Ordering, SynchronizationScope SynchScope = CrossThread) { setOrdering(Ordering); @@ -361,7 +360,6 @@ (xthread << 6)); } - bool isAtomic() const { return getOrdering() != NotAtomic; } void setAtomic(AtomicOrdering Ordering, SynchronizationScope SynchScope = CrossThread) { setOrdering(Ordering); Index: include/llvm/Target/TargetLowering.h =================================================================== --- include/llvm/Target/TargetLowering.h +++ include/llvm/Target/TargetLowering.h @@ -31,6 +31,7 @@ #include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/IRBuilder.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Target/TargetCallingConv.h" @@ -935,6 +936,10 @@ /// \name Helpers for atomic expansion. /// @{ + /// True if AtomicExpandPass should use emitLoadLinked/emitStoreConditional + /// and expand AtomicCmpXchgInst. + virtual bool hasLoadLinkedStoreConditional() const { return false; } + /// Perform a load-linked operation on Addr, returning a "Value *" with the /// corresponding pointee type. This may entail some non-trivial operations to /// truncate or reconstruct types that will be illegal in the backend. See @@ -955,7 +960,7 @@ /// It is called by AtomicExpandPass before expanding an /// AtomicRMW/AtomicCmpXchg/AtomicStore/AtomicLoad. /// RMW and CmpXchg set both IsStore and IsLoad to true. - /// Backends with !getInsertFencesForAtomic() should keep a no-op here + /// Backends with !getInsertFencesForAtomic() should keep a no-op here. virtual void emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord, bool IsStore, bool IsLoad) const { assert(!getInsertFencesForAtomic()); @@ -965,20 +970,28 @@ /// It is called by AtomicExpandPass after expanding an /// AtomicRMW/AtomicCmpXchg/AtomicStore/AtomicLoad. /// RMW and CmpXchg set both IsStore and IsLoad to true. - /// Backends with !getInsertFencesForAtomic() should keep a no-op here + /// Backends with !getInsertFencesForAtomic() should keep a no-op here. virtual void emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord, bool IsStore, bool IsLoad) const { assert(!getInsertFencesForAtomic()); } - /// Return true if the given (atomic) instruction should be expanded by the - /// IR-level AtomicExpand pass into a loop involving - /// load-linked/store-conditional pairs. Atomic stores will be expanded in the - /// same way as "atomic xchg" operations which ignore their output if needed. - virtual bool shouldExpandAtomicInIR(Instruction *Inst) const { + /// Returns true if the given (atomic) store should be expanded by the + /// IR-level AtomicExpand pass into an "atomic xchg" which ignores its input. 
+ virtual bool shouldExpandAtomicStoreInIR(StoreInst *SI) const { return false; } + /// Returns true if the given (atomic) load should be expanded by the + /// IR-level AtomicExpand pass into a load-linked instruction + /// (through emitLoadLinked()). + virtual bool shouldExpandAtomicLoadInIR(LoadInst *LI) const { return false; } + + /// Returns true if the given AtomicRMW should be expanded by the + /// IR-level AtomicExpand pass into a loop using LoadLinked/StoreConditional. + virtual bool shouldExpandAtomicRMWInIR(AtomicRMWInst *RMWI) const { + return false; + } //===--------------------------------------------------------------------===// // TargetLowering Configuration Methods - These methods should be invoked by Index: lib/CodeGen/AtomicExpandPass.cpp =================================================================== --- lib/CodeGen/AtomicExpandPass.cpp +++ lib/CodeGen/AtomicExpandPass.cpp @@ -8,13 +8,14 @@ //===----------------------------------------------------------------------===// // // This file contains a pass (at IR level) to replace atomic instructions with -// appropriate (intrinsic-based) ldrex/strex loops. +// either (intrinsic-based) ldrex/strex loops or AtomicCmpXchg. // //===----------------------------------------------------------------------===// #include "llvm/CodeGen/Passes.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" @@ -38,15 +39,14 @@ } bool runOnFunction(Function &F) override; - bool expandAtomicInsts(Function &F); + private: bool expandAtomicLoad(LoadInst *LI); - bool expandAtomicStore(StoreInst *LI); + bool expandAtomicStore(StoreInst *SI); bool expandAtomicRMW(AtomicRMWInst *AI); + bool expandAtomicRMWToLLSC(AtomicRMWInst *AI); + bool expandAtomicRMWToCmpXchg(AtomicRMWInst *AI); bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI); - - AtomicOrdering insertLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord); - void insertTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord); }; } @@ -63,55 +63,64 @@ bool AtomicExpand::runOnFunction(Function &F) { if (!TM || !TM->getSubtargetImpl()->enableAtomicExpand()) return false; + auto TargetLowering = TM->getSubtargetImpl()->getTargetLowering(); SmallVector AtomicInsts; // Changing control-flow while iterating through it is a bad idea, so gather a // list of all atomic instructions before we start. 
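Note on the new hooks: an atomic store that the target flags via shouldExpandAtomicStoreInIR is rewritten as an atomic exchange whose loaded result is simply discarded. A minimal user-level sketch of that equivalence with std::atomic (illustrative only, not part of the patch):

    #include <atomic>

    std::atomic<long long> Loc;

    // A "store atomic" of a value too wide for a plain store becomes, after
    // expansion, an exchange whose old value is ignored.
    void storeViaXchg(long long V) {
      (void)Loc.exchange(V, std::memory_order_seq_cst); // result discarded
    }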
- for (BasicBlock &BB : F) - for (Instruction &Inst : BB) { - if (isa<AtomicRMWInst>(&Inst) || isa<AtomicCmpXchgInst>(&Inst) || - (isa<LoadInst>(&Inst) && cast<LoadInst>(&Inst)->isAtomic()) || - (isa<StoreInst>(&Inst) && cast<StoreInst>(&Inst)->isAtomic())) - AtomicInsts.push_back(&Inst); - } + for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) { + if (I->isAtomic()) + AtomicInsts.push_back(&*I); + } bool MadeChange = false; - for (Instruction *Inst : AtomicInsts) { - if (!TM->getSubtargetImpl()->getTargetLowering()->shouldExpandAtomicInIR( - Inst)) - continue; + for (auto I : AtomicInsts) { + auto LI = dyn_cast<LoadInst>(I); + auto SI = dyn_cast<StoreInst>(I); + auto RMWI = dyn_cast<AtomicRMWInst>(I); + auto CASI = dyn_cast<AtomicCmpXchgInst>(I); + + assert((LI || SI || RMWI || CASI || isa<FenceInst>(I)) && + "Unknown atomic instruction"); - if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(Inst)) - MadeChange |= expandAtomicRMW(AI); - else if (AtomicCmpXchgInst *CI = dyn_cast<AtomicCmpXchgInst>(Inst)) - MadeChange |= expandAtomicCmpXchg(CI); - else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) + if (LI && TargetLowering->shouldExpandAtomicLoadInIR(LI)) { MadeChange |= expandAtomicLoad(LI); - else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) + } else if (SI && TargetLowering->shouldExpandAtomicStoreInIR(SI)) { MadeChange |= expandAtomicStore(SI); - else - llvm_unreachable("Unknown atomic instruction"); + } else if (RMWI && TargetLowering->shouldExpandAtomicRMWInIR(RMWI)) { + MadeChange |= expandAtomicRMW(RMWI); + } else if (CASI && TargetLowering->hasLoadLinkedStoreConditional()) { + MadeChange |= expandAtomicCmpXchg(CASI); + } } - return MadeChange; } bool AtomicExpand::expandAtomicLoad(LoadInst *LI) { - // Load instructions don't actually need a leading fence, even in the - // SequentiallyConsistent case. + auto TLI = TM->getSubtargetImpl()->getTargetLowering(); + // If getInsertFencesForAtomic() returns true, then the target does not want + // to deal with memory orders, and emitLeading/TrailingFence should take care + // of everything. Otherwise, emitLeading/TrailingFence are no-op and we + // should preserve the ordering. AtomicOrdering MemOpOrder = - TM->getSubtargetImpl()->getTargetLowering()->getInsertFencesForAtomic() - ? Monotonic - : LI->getOrdering(); + TLI->getInsertFencesForAtomic() ? Monotonic : LI->getOrdering(); + IRBuilder<> Builder(LI); - // The only 64-bit load guaranteed to be single-copy atomic by the ARM is + // Note that although no fence is required before atomic load on ARM, it is + // required before SequentiallyConsistent loads for the recommended Power + // mapping (see http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html). + // So we let the target choose what to emit. + TLI->emitLeadingFence(Builder, LI->getOrdering(), + /*IsStore=*/false, /*IsLoad=*/true); + + // The only 64-bit load guaranteed to be single-copy atomic by ARM is // an ldrexd (A3.5.3). - IRBuilder<> Builder(LI); - Value *Val = TM->getSubtargetImpl()->getTargetLowering()->emitLoadLinked( - Builder, LI->getPointerOperand(), MemOpOrder); + Value *Val = + TLI->emitLoadLinked(Builder, LI->getPointerOperand(), MemOpOrder); - insertTrailingFence(Builder, LI->getOrdering()); + TLI->emitTrailingFence(Builder, LI->getOrdering(), + /*IsStore=*/false, /*IsLoad=*/true); LI->replaceAllUsesWith(Val); LI->eraseFromParent(); @@ -120,9 +129,12 @@ } bool AtomicExpand::expandAtomicStore(StoreInst *SI) { - // The only atomic 64-bit store on ARM is an strexd that succeeds, which means - // we need a loop and the entire instruction is essentially an "atomicrmw - // xchg" that ignores the value loaded.
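The getInsertFencesForAtomic() choice made in expandAtomicLoad above has a direct C++ analogue: ordering is either carried by the access itself, or approximated by a relaxed ("monotonic") access bracketed with explicit fences, which is what emitLeading/TrailingFence produce for such targets. A sketch of the two forms for a load (illustrative only):

    #include <atomic>

    std::atomic<int> Flag;

    int loadAcquireDirect() {
      return Flag.load(std::memory_order_acquire);          // ordering on the access itself
    }

    int loadAcquireWithFence() {
      int V = Flag.load(std::memory_order_relaxed);          // "monotonic" access...
      std::atomic_thread_fence(std::memory_order_acquire);   // ...plus a trailing fence
      return V;
    }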
+ // This function is only called on atomic stores that are too large to be + // atomic if implemented as a native store. So we replace them by an + // atomic swap, that can be implemented for example as a ldrex/strex on ARM + // or lock cmpxchg8/16b on X86, as these are atomic for larger sizes. + // It is the responsibility of the target to only return true in + // shouldExpandAtomicRMW in cases where this is required and possible. IRBuilder<> Builder(SI); AtomicRMWInst *AI = Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, SI->getPointerOperand(), @@ -134,11 +146,64 @@ } bool AtomicExpand::expandAtomicRMW(AtomicRMWInst *AI) { - AtomicOrdering Order = AI->getOrdering(); + if (TM->getSubtargetImpl() + ->getTargetLowering() + ->hasLoadLinkedStoreConditional()) + return expandAtomicRMWToLLSC(AI); + else + return expandAtomicRMWToCmpXchg(AI); +} + +/// Emit IR to implement the given atomicrmw operation on values in registers, +/// returning the new value. +static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder, + Value *Loaded, Value *Inc) { + Value *NewVal; + switch (Op) { + case AtomicRMWInst::Xchg: + return Inc; + case AtomicRMWInst::Add: + return Builder.CreateAdd(Loaded, Inc, "new"); + case AtomicRMWInst::Sub: + return Builder.CreateSub(Loaded, Inc, "new"); + case AtomicRMWInst::And: + return Builder.CreateAnd(Loaded, Inc, "new"); + case AtomicRMWInst::Nand: + return Builder.CreateNot(Builder.CreateAnd(Loaded, Inc), "new"); + case AtomicRMWInst::Or: + return Builder.CreateOr(Loaded, Inc, "new"); + case AtomicRMWInst::Xor: + return Builder.CreateXor(Loaded, Inc, "new"); + case AtomicRMWInst::Max: + NewVal = Builder.CreateICmpSGT(Loaded, Inc); + return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); + case AtomicRMWInst::Min: + NewVal = Builder.CreateICmpSLE(Loaded, Inc); + return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); + case AtomicRMWInst::UMax: + NewVal = Builder.CreateICmpUGT(Loaded, Inc); + return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); + case AtomicRMWInst::UMin: + NewVal = Builder.CreateICmpULE(Loaded, Inc); + return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); + default: + llvm_unreachable("Unknown atomic op"); + } +} + +bool AtomicExpand::expandAtomicRMWToLLSC(AtomicRMWInst *AI) { + auto TLI = TM->getSubtargetImpl()->getTargetLowering(); + AtomicOrdering FenceOrder = AI->getOrdering(); Value *Addr = AI->getPointerOperand(); BasicBlock *BB = AI->getParent(); Function *F = BB->getParent(); LLVMContext &Ctx = F->getContext(); + // If getInsertFencesForAtomic() returns true, then the target does not want + // to deal with memory orders, and emitLeading/TrailingFence should take care + // of everything. Otherwise, emitLeading/TrailingFence are no-op and we + // should preserve the ordering. + AtomicOrdering MemOpOrder = + TLI->getInsertFencesForAtomic() ? Monotonic : FenceOrder; // Given: atomicrmw some_op iN* %addr, iN %incr ordering // @@ -165,67 +230,24 @@ // the branch entirely. std::prev(BB->end())->eraseFromParent(); Builder.SetInsertPoint(BB); - AtomicOrdering MemOpOrder = insertLeadingFence(Builder, Order); + TLI->emitLeadingFence(Builder, FenceOrder, /*IsStore=*/true, /*IsLoad=*/true); Builder.CreateBr(LoopBB); // Start the main loop block now that we've taken care of the preliminaries. 
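For reference, the values computed by the new performAtomicOp helper match this plain scalar function; the RMWOp enum and applyRMW name are hypothetical stand-ins, only the arithmetic is meant to mirror the IR built above:

    #include <cassert>
    #include <cstdint>

    enum class RMWOp { Xchg, Add, Sub, And, Nand, Or, Xor, Max, Min, UMax, UMin };

    int64_t applyRMW(RMWOp Op, int64_t Loaded, int64_t Inc) {
      switch (Op) {
      case RMWOp::Xchg: return Inc;
      case RMWOp::Add:  return Loaded + Inc;
      case RMWOp::Sub:  return Loaded - Inc;
      case RMWOp::And:  return Loaded & Inc;
      case RMWOp::Nand: return ~(Loaded & Inc);
      case RMWOp::Or:   return Loaded | Inc;
      case RMWOp::Xor:  return Loaded ^ Inc;
      case RMWOp::Max:  return Loaded > Inc ? Loaded : Inc;                      // signed
      case RMWOp::Min:  return Loaded <= Inc ? Loaded : Inc;                     // signed
      case RMWOp::UMax: return (uint64_t)Loaded > (uint64_t)Inc ? Loaded : Inc;
      case RMWOp::UMin: return (uint64_t)Loaded <= (uint64_t)Inc ? Loaded : Inc;
      }
      assert(false && "unknown atomic op");
      return 0;
    }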
Builder.SetInsertPoint(LoopBB); - Value *Loaded = TM->getSubtargetImpl()->getTargetLowering()->emitLoadLinked( - Builder, Addr, MemOpOrder); + Value *Loaded = TLI->emitLoadLinked(Builder, Addr, MemOpOrder); - Value *NewVal; - switch (AI->getOperation()) { - case AtomicRMWInst::Xchg: - NewVal = AI->getValOperand(); - break; - case AtomicRMWInst::Add: - NewVal = Builder.CreateAdd(Loaded, AI->getValOperand(), "new"); - break; - case AtomicRMWInst::Sub: - NewVal = Builder.CreateSub(Loaded, AI->getValOperand(), "new"); - break; - case AtomicRMWInst::And: - NewVal = Builder.CreateAnd(Loaded, AI->getValOperand(), "new"); - break; - case AtomicRMWInst::Nand: - NewVal = Builder.CreateNot(Builder.CreateAnd(Loaded, AI->getValOperand()), - "new"); - break; - case AtomicRMWInst::Or: - NewVal = Builder.CreateOr(Loaded, AI->getValOperand(), "new"); - break; - case AtomicRMWInst::Xor: - NewVal = Builder.CreateXor(Loaded, AI->getValOperand(), "new"); - break; - case AtomicRMWInst::Max: - NewVal = Builder.CreateICmpSGT(Loaded, AI->getValOperand()); - NewVal = Builder.CreateSelect(NewVal, Loaded, AI->getValOperand(), "new"); - break; - case AtomicRMWInst::Min: - NewVal = Builder.CreateICmpSLE(Loaded, AI->getValOperand()); - NewVal = Builder.CreateSelect(NewVal, Loaded, AI->getValOperand(), "new"); - break; - case AtomicRMWInst::UMax: - NewVal = Builder.CreateICmpUGT(Loaded, AI->getValOperand()); - NewVal = Builder.CreateSelect(NewVal, Loaded, AI->getValOperand(), "new"); - break; - case AtomicRMWInst::UMin: - NewVal = Builder.CreateICmpULE(Loaded, AI->getValOperand()); - NewVal = Builder.CreateSelect(NewVal, Loaded, AI->getValOperand(), "new"); - break; - default: - llvm_unreachable("Unknown atomic op"); - } + Value *NewVal = + performAtomicOp(AI->getOperation(), Builder, Loaded, AI->getValOperand()); Value *StoreSuccess = - TM->getSubtargetImpl()->getTargetLowering()->emitStoreConditional( - Builder, NewVal, Addr, MemOpOrder); + TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder); Value *TryAgain = Builder.CreateICmpNE( StoreSuccess, ConstantInt::get(IntegerType::get(Ctx, 32), 0), "tryagain"); Builder.CreateCondBr(TryAgain, LoopBB, ExitBB); Builder.SetInsertPoint(ExitBB, ExitBB->begin()); - insertTrailingFence(Builder, Order); + TLI->emitTrailingFence(Builder, FenceOrder, /*IsStore=*/true, /*IsLoad=*/true); AI->replaceAllUsesWith(Loaded); AI->eraseFromParent(); @@ -233,13 +255,90 @@ return true; } +bool AtomicExpand::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI) { + auto TargetLowering = TM->getSubtargetImpl()->getTargetLowering(); + AtomicOrdering FenceOrder = + AI->getOrdering() == Unordered ? Monotonic : AI->getOrdering(); + AtomicOrdering MemOpOrder = + TargetLowering->getInsertFencesForAtomic() ? Monotonic : FenceOrder; + Value *Addr = AI->getPointerOperand(); + BasicBlock *BB = AI->getParent(); + Function *F = BB->getParent(); + LLVMContext &Ctx = F->getContext(); + + // Given: atomicrmw some_op iN* %addr, iN %incr ordering + // + // The standard expansion we produce is: + // [...] + // %init_loaded = load atomic iN* %addr + // br label %loop + // loop: + // %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ] + // %new = some_op iN %loaded, %incr + // %pair = cmpxchg iN* %addr, iN %loaded, iN %new + // %new_loaded = extractvalue { iN, i1 } %pair, 0 + // %success = extractvalue { iN, i1 } %pair, 1 + // br i1 %success, label %atomicrmw.end, label %loop + // atomicrmw.end: + // [...] 
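The expansion sketched in the comment above has the same shape as a hand-written compare-exchange loop. A C++ sketch for the nand case (compare_exchange_weak is fine here because we retry anyway; the pass itself emits a strong cmpxchg):

    #include <atomic>
    #include <cstdint>

    int64_t fetchNand(std::atomic<int64_t> &A, int64_t Inc) {
      int64_t Loaded = A.load(std::memory_order_relaxed);       // %init_loaded
      int64_t New;
      do {
        New = ~(Loaded & Inc);                                   // %new = some_op %loaded, %incr
      } while (!A.compare_exchange_weak(Loaded, New,             // %pair = cmpxchg ...; on failure,
                                        std::memory_order_seq_cst,
                                        std::memory_order_relaxed)); // Loaded is refreshed (%new_loaded)
      return Loaded;                                             // value observed before the update
    }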
+ BasicBlock *ExitBB = BB->splitBasicBlock(AI, "atomicrmw.end"); + BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB); + + // This grabs the DebugLoc from AI. + IRBuilder<> Builder(AI); + + // The split call above "helpfully" added a branch at the end of BB (to the + // wrong place), but we want a load. It's easiest to just remove + // the branch entirely. + std::prev(BB->end())->eraseFromParent(); + Builder.SetInsertPoint(BB); + TargetLowering->emitLeadingFence(Builder, FenceOrder, + /*IsStore=*/true, /*IsLoad=*/true); + LoadInst *InitLoaded = Builder.CreateLoad(Addr); + InitLoaded->setAlignment(AI->getType()->getPrimitiveSizeInBits()); + Builder.CreateBr(LoopBB); + + // Start the main loop block now that we've taken care of the preliminaries. + Builder.SetInsertPoint(LoopBB); + PHINode *Loaded = Builder.CreatePHI(AI->getType(), 2, "loaded"); + Loaded->addIncoming(InitLoaded, BB); + + Value *NewVal = + performAtomicOp(AI->getOperation(), Builder, Loaded, AI->getValOperand()); + + Value *Pair = Builder.CreateAtomicCmpXchg( + Addr, Loaded, NewVal, MemOpOrder, + AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder)); + Value *NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded"); + Loaded->addIncoming(NewLoaded, LoopBB); + + Value *Success = Builder.CreateExtractValue(Pair, 1, "success"); + Builder.CreateCondBr(Success, ExitBB, LoopBB); + + Builder.SetInsertPoint(ExitBB, ExitBB->begin()); + TargetLowering->emitTrailingFence(Builder, FenceOrder, + /*IsStore=*/true, /*IsLoad=*/true); + + AI->replaceAllUsesWith(NewLoaded); + AI->eraseFromParent(); + + return true; +} + bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { + auto TLI = TM->getSubtargetImpl()->getTargetLowering(); AtomicOrdering SuccessOrder = CI->getSuccessOrdering(); AtomicOrdering FailureOrder = CI->getFailureOrdering(); Value *Addr = CI->getPointerOperand(); BasicBlock *BB = CI->getParent(); Function *F = BB->getParent(); LLVMContext &Ctx = F->getContext(); + // If getInsertFencesForAtomic() returns true, then the target does not want + // to deal with memory orders, and emitLeading/TrailingFence should take care + // of everything. Otherwise, emitLeading/TrailingFence are no-op and we + // should preserve the ordering. + AtomicOrdering MemOpOrder = + TLI->getInsertFencesForAtomic() ? Monotonic : SuccessOrder; // Given: cmpxchg some_op iN* %addr, iN %desired, iN %new success_ord fail_ord // @@ -280,13 +379,13 @@ // the branch entirely. std::prev(BB->end())->eraseFromParent(); Builder.SetInsertPoint(BB); - AtomicOrdering MemOpOrder = insertLeadingFence(Builder, SuccessOrder); + TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true, + /*IsLoad=*/true); Builder.CreateBr(LoopBB); // Start the main loop block now that we've taken care of the preliminaries. 
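getStrongestFailureOrdering() above picks the failure ordering for the emitted cmpxchg; the rule mirrors C++, where compare_exchange takes separate success and failure orderings and the failure ordering may not carry release semantics. A small sketch (acq_rel success paired with acquire failure is an assumed use case):

    #include <atomic>

    bool tryPublish(std::atomic<int> &Slot, int Expected, int Desired) {
      // No store happens on failure, so no release component is needed there.
      return Slot.compare_exchange_strong(Expected, Desired,
                                          std::memory_order_acq_rel,
                                          std::memory_order_acquire);
    }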
Builder.SetInsertPoint(LoopBB); - Value *Loaded = TM->getSubtargetImpl()->getTargetLowering()->emitLoadLinked( - Builder, Addr, MemOpOrder); + Value *Loaded = TLI->emitLoadLinked(Builder, Addr, MemOpOrder); Value *ShouldStore = Builder.CreateICmpEQ(Loaded, CI->getCompareOperand(), "should_store"); @@ -295,9 +394,8 @@ Builder.CreateCondBr(ShouldStore, TryStoreBB, FailureBB); Builder.SetInsertPoint(TryStoreBB); - Value *StoreSuccess = - TM->getSubtargetImpl()->getTargetLowering()->emitStoreConditional( - Builder, CI->getNewValOperand(), Addr, MemOpOrder); + Value *StoreSuccess = TLI->emitStoreConditional( + Builder, CI->getNewValOperand(), Addr, MemOpOrder); StoreSuccess = Builder.CreateICmpEQ( StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success"); Builder.CreateCondBr(StoreSuccess, SuccessBB, @@ -305,11 +403,13 @@ // Make sure later instructions don't get reordered with a fence if necessary. Builder.SetInsertPoint(SuccessBB); - insertTrailingFence(Builder, SuccessOrder); + TLI->emitTrailingFence(Builder, SuccessOrder, /*IsStore=*/true, + /*IsLoad=*/true); Builder.CreateBr(ExitBB); Builder.SetInsertPoint(FailureBB); - insertTrailingFence(Builder, FailureOrder); + TLI->emitTrailingFence(Builder, FailureOrder, /*IsStore=*/true, + /*IsLoad=*/true); Builder.CreateBr(ExitBB); // Finally, we have control-flow based knowledge of whether the cmpxchg @@ -358,27 +458,3 @@ CI->eraseFromParent(); return true; } - -AtomicOrdering AtomicExpand::insertLeadingFence(IRBuilder<> &Builder, - AtomicOrdering Ord) { - if (!TM->getSubtargetImpl()->getTargetLowering()->getInsertFencesForAtomic()) - return Ord; - - if (Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent) - Builder.CreateFence(Release); - - // The exclusive operations don't need any barrier if we're adding separate - // fences. 
- return Monotonic; -} - -void AtomicExpand::insertTrailingFence(IRBuilder<> &Builder, - AtomicOrdering Ord) { - if (!TM->getSubtargetImpl()->getTargetLowering()->getInsertFencesForAtomic()) - return; - - if (Ord == Acquire || Ord == AcquireRelease) - Builder.CreateFence(Acquire); - else if (Ord == SequentiallyConsistent) - Builder.CreateFence(SequentiallyConsistent); -} Index: lib/IR/Instruction.cpp =================================================================== --- lib/IR/Instruction.cpp +++ lib/IR/Instruction.cpp @@ -443,6 +443,21 @@ } } +bool Instruction::isAtomic() const { + switch (getOpcode()) { + default: + return false; + case Instruction::AtomicCmpXchg: + case Instruction::AtomicRMW: + case Instruction::Fence: + return true; + case Instruction::Load: + return cast(this)->getOrdering() != NotAtomic; + case Instruction::Store: + return cast(this)->getOrdering() != NotAtomic; + } +} + bool Instruction::mayThrow() const { if (const CallInst *CI = dyn_cast(this)) return !CI->doesNotThrow(); Index: lib/Target/AArch64/AArch64ISelLowering.h =================================================================== --- lib/Target/AArch64/AArch64ISelLowering.h +++ lib/Target/AArch64/AArch64ISelLowering.h @@ -317,12 +317,15 @@ bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override; + bool hasLoadLinkedStoreConditional() const override; Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const override; Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override; - bool shouldExpandAtomicInIR(Instruction *Inst) const override; + bool shouldExpandAtomicLoadInIR(LoadInst *LI) const override; + bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; + bool shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; bool useLoadStackGuardNode() const override; TargetLoweringBase::LegalizeTypeAction Index: lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- lib/Target/AArch64/AArch64ISelLowering.cpp +++ lib/Target/AArch64/AArch64ISelLowering.cpp @@ -8513,19 +8513,6 @@ } } -bool AArch64TargetLowering::shouldExpandAtomicInIR(Instruction *Inst) const { - // Loads and stores less than 128-bits are already atomic; ones above that - // are doomed anyway, so defer to the default libcall and blame the OS when - // things go wrong: - if (StoreInst *SI = dyn_cast(Inst)) - return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128; - else if (LoadInst *LI = dyn_cast(Inst)) - return LI->getType()->getPrimitiveSizeInBits() == 128; - - // For the real atomic operations, we have ldxr/stxr up to 128 bits. - return Inst->getType()->getPrimitiveSizeInBits() <= 128; -} - bool AArch64TargetLowering::useLoadStackGuardNode() const { return true; } @@ -8542,6 +8529,32 @@ return TargetLoweringBase::getPreferredVectorAction(VT); } +// Loads and stores less than 128-bits are already atomic; ones above that +// are doomed anyway, so defer to the default libcall and blame the OS when +// things go wrong. +bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { + unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); + return Size == 128; +} + +// Loads and stores less than 128-bits are already atomic; ones above that +// are doomed anyway, so defer to the default libcall and blame the OS when +// things go wrong. 
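A minimal illustration of the new Instruction::isAtomic() predicate together with inst_iterator, the same idiom runOnFunction now uses; note that fences also report isAtomic(), so clients that only care about memory accesses must filter them out. Sketch against the 3.5-era headers; countAtomicAccesses is a hypothetical helper:

    #include "llvm/IR/Function.h"
    #include "llvm/IR/InstIterator.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // Count atomic loads/stores/RMWs/cmpxchgs in F, skipping fences.
    static unsigned countAtomicAccesses(Function &F) {
      unsigned N = 0;
      for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
        if (I->isAtomic() && !isa<FenceInst>(&*I))
          ++N;
      return N;
    }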
+bool AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { + unsigned Size = LI->getType()->getPrimitiveSizeInBits(); + return Size == 128; +} + +// For the real atomic operations, we have ldxr/stxr up to 128 bits, +bool AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { + unsigned Size = AI->getType()->getPrimitiveSizeInBits(); + return Size <= 128; +} + +bool AArch64TargetLowering::hasLoadLinkedStoreConditional() const { + return true; +} + Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Index: lib/Target/ARM/ARMISelLowering.h =================================================================== --- lib/Target/ARM/ARMISelLowering.h +++ lib/Target/ARM/ARMISelLowering.h @@ -392,12 +392,20 @@ bool functionArgumentNeedsConsecutiveRegisters( Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override; + bool hasLoadLinkedStoreConditional() const override; Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const override; Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override; - bool shouldExpandAtomicInIR(Instruction *Inst) const override; + void emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord, + bool IsStore, bool IsLoad) const override; + void emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord, + bool IsStore, bool IsLoad) const override; + + bool shouldExpandAtomicLoadInIR(LoadInst *LI) const override; + bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; + bool shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; bool useLoadStackGuardNode() const override; Index: lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- lib/Target/ARM/ARMISelLowering.cpp +++ lib/Target/ARM/ARMISelLowering.cpp @@ -2723,7 +2723,7 @@ ConstantSDNode *OrdN = cast(Op.getOperand(1)); AtomicOrdering Ord = static_cast(OrdN->getZExtValue()); - unsigned Domain = ARM_MB::ISH; + ARM_MB::MemBOpt Domain = ARM_MB::ISH; if (Subtarget->isMClass()) { // Only a full system barrier exists in the M-class architectures. Domain = ARM_MB::SY; @@ -10982,23 +10982,88 @@ return true; } -bool ARMTargetLowering::shouldExpandAtomicInIR(Instruction *Inst) const { - // Loads and stores less than 64-bits are already atomic; ones above that - // are doomed anyway, so defer to the default libcall and blame the OS when - // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit - // anything for those. 
- bool IsMClass = Subtarget->isMClass(); - if (StoreInst *SI = dyn_cast(Inst)) { - unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); - return Size == 64 && !IsMClass; - } else if (LoadInst *LI = dyn_cast(Inst)) { - return LI->getType()->getPrimitiveSizeInBits() == 64 && !IsMClass; +bool ARMTargetLowering::hasLoadLinkedStoreConditional() const { return true; } + +static void makeDMB(IRBuilder<> &Builder, ARM_MB::MemBOpt Domain) { + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Function *DMB = llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_dmb); + Constant *CDomain = Builder.getInt32(Domain); + Builder.CreateCall(DMB, CDomain); +} + +// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html +void ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder, + AtomicOrdering Ord, bool IsStore, + bool IsLoad) const { + if (!getInsertFencesForAtomic()) + return; + + switch (Ord) { + case NotAtomic: + case Unordered: + llvm_unreachable("Invalid fence: unordered/non-atomic"); + case Monotonic: + case Acquire: + return; // Nothing to do + case SequentiallyConsistent: + if (!IsStore) + return; // Nothing to do + /*FALLTHROUGH*/ + case Release: + case AcquireRelease: + if (Subtarget->isSwift()) + makeDMB(Builder, ARM_MB::ISHST); + // FIXME: add a comment with a link to documentation justifying this. + else + makeDMB(Builder, ARM_MB::ISH); + return; } +} + +void ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder, + AtomicOrdering Ord, bool IsStore, + bool IsLoad) const { + if (!getInsertFencesForAtomic()) + return; + + switch (Ord) { + case NotAtomic: + case Unordered: + llvm_unreachable("Invalid fence: unordered/not-atomic"); + case Monotonic: + case Release: + return; // Nothing to do + case Acquire: + case AcquireRelease: + case SequentiallyConsistent: + makeDMB(Builder, ARM_MB::ISH); + return; + } +} + +// Loads and stores less than 64-bits are already atomic; ones above that +// are doomed anyway, so defer to the default libcall and blame the OS when +// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit +// anything for those. +bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { + unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); + return (Size == 64) && !Subtarget->isMClass(); +} + +// Loads and stores less than 64-bits are already atomic; ones above that +// are doomed anyway, so defer to the default libcall and blame the OS when +// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit +// anything for those. +bool ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { + unsigned Size = LI->getType()->getPrimitiveSizeInBits(); + return (Size == 64) && !Subtarget->isMClass(); +} - // For the real atomic operations, we have ldrex/strex up to 32 bits, - // and up to 64 bits on the non-M profiles - unsigned AtomicLimit = IsMClass ? 32 : 64; - return Inst->getType()->getPrimitiveSizeInBits() <= AtomicLimit; +// For the real atomic operations, we have ldrex/strex up to 32 bits, +// and up to 64 bits on the non-M profiles +bool ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { + unsigned Size = AI->getType()->getPrimitiveSizeInBits(); + return Size <= (Subtarget->isMClass() ? 32 : 64); } // This has so far only been implemented for MachO. 
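The dmb placement chosen by emitLeadingFence/emitTrailingFence above matches the ARMv7 mapping from the page cited in the code. A small std::atomic example; the instruction sequences in the comments are the expected ARMv7 lowering and are informational only:

    #include <atomic>

    std::atomic<int> G;

    int  loadAcquire()        { return G.load(std::memory_order_acquire); }          // ldr; dmb ish
    void storeRelease(int V)  { G.store(V, std::memory_order_release); }             // dmb ish; str
    void storeSeqCst(int V)   { G.store(V, std::memory_order_seq_cst); }             // dmb ish; str; dmb ish
    int  addSeqCst(int V)     { return G.fetch_add(V, std::memory_order_seq_cst); }  // dmb ish; ldrex/strex loop; dmb ish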
Index: lib/Target/X86/CMakeLists.txt =================================================================== --- lib/Target/X86/CMakeLists.txt +++ lib/Target/X86/CMakeLists.txt @@ -14,7 +14,6 @@ set(sources X86AsmPrinter.cpp - X86AtomicExpandPass.cpp X86FastISel.cpp X86FloatingPoint.cpp X86FrameLowering.cpp Index: lib/Target/X86/X86.h =================================================================== --- lib/Target/X86/X86.h +++ lib/Target/X86/X86.h @@ -23,10 +23,6 @@ class ImmutablePass; class X86TargetMachine; -/// createX86AtomicExpandPass - This pass expands atomic operations that cannot -/// be handled natively in terms of a loop using cmpxchg. -FunctionPass *createX86AtomicExpandPass(const X86TargetMachine *TM); - /// createX86ISelDag - This pass converts a legalized DAG into a /// X86-specific DAG, ready for instruction scheduling. /// Index: lib/Target/X86/X86AtomicExpandPass.cpp =================================================================== --- lib/Target/X86/X86AtomicExpandPass.cpp +++ /dev/null @@ -1,283 +0,0 @@ -//===-- X86AtomicExpandPass.cpp - Expand illegal atomic instructions --0---===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains a pass (at IR level) to replace atomic instructions which -// cannot be implemented as a single instruction with cmpxchg-based loops. -// -//===----------------------------------------------------------------------===// - -#include "X86.h" -#include "X86TargetMachine.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/IR/Module.h" -#include "llvm/Support/Debug.h" -#include "llvm/Target/TargetLowering.h" -#include "llvm/Target/TargetMachine.h" -using namespace llvm; - -#define DEBUG_TYPE "x86-atomic-expand" - -namespace { - class X86AtomicExpandPass : public FunctionPass { - const X86TargetMachine *TM; - public: - static char ID; // Pass identification, replacement for typeid - explicit X86AtomicExpandPass(const X86TargetMachine *TM) - : FunctionPass(ID), TM(TM) {} - - bool runOnFunction(Function &F) override; - bool expandAtomicInsts(Function &F); - - bool needsCmpXchgNb(Type *MemType); - - /// There are four kinds of atomic operations. Two never need expanding: - /// cmpxchg is what we expand the others *to*, and loads are easily handled - /// by ISelLowering. Atomicrmw and store can need expanding in some - /// circumstances. - bool shouldExpand(Instruction *Inst); - - /// 128-bit atomic stores (64-bit on i686) need to be implemented in terms - /// of trivial cmpxchg16b loops. A simple store isn't necessarily atomic. - bool shouldExpandStore(StoreInst *SI); - - /// Only some atomicrmw instructions need expanding -- some operations - /// (e.g. max) have absolutely no architectural support; some (e.g. or) have - /// limited support but can't return the previous value; some (e.g. add) - /// have complete support in the instruction set. - /// - /// Also, naturally, 128-bit operations always need to be expanded. 
- bool shouldExpandAtomicRMW(AtomicRMWInst *AI); - - bool expandAtomicRMW(AtomicRMWInst *AI); - bool expandAtomicStore(StoreInst *SI); - }; -} - -char X86AtomicExpandPass::ID = 0; - -FunctionPass *llvm::createX86AtomicExpandPass(const X86TargetMachine *TM) { - return new X86AtomicExpandPass(TM); -} - -bool X86AtomicExpandPass::runOnFunction(Function &F) { - SmallVector AtomicInsts; - - // Changing control-flow while iterating through it is a bad idea, so gather a - // list of all atomic instructions before we start. - for (BasicBlock &BB : F) - for (Instruction &Inst : BB) { - if (isa(&Inst) || - (isa(&Inst) && cast(&Inst)->isAtomic())) - AtomicInsts.push_back(&Inst); - } - - bool MadeChange = false; - for (Instruction *Inst : AtomicInsts) { - if (!shouldExpand(Inst)) - continue; - - if (AtomicRMWInst *AI = dyn_cast(Inst)) - MadeChange |= expandAtomicRMW(AI); - if (StoreInst *SI = dyn_cast(Inst)) - MadeChange |= expandAtomicStore(SI); - - assert(MadeChange && "Atomic inst not expanded when it should be?"); - Inst->eraseFromParent(); - } - - return MadeChange; -} - -/// Returns true if the operand type is 1 step up from the native width, and -/// the corresponding cmpxchg8b or cmpxchg16b instruction is available -/// (otherwise we leave them alone to become __sync_fetch_and_... calls). -bool X86AtomicExpandPass::needsCmpXchgNb(llvm::Type *MemType) { - const X86Subtarget &Subtarget = TM->getSubtarget(); - unsigned OpWidth = MemType->getPrimitiveSizeInBits(); - - if (OpWidth == 64) - return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b - if (OpWidth == 128) - return Subtarget.hasCmpxchg16b(); - - return false; -} - -bool X86AtomicExpandPass::shouldExpandAtomicRMW(AtomicRMWInst *AI) { - const X86Subtarget &Subtarget = TM->getSubtarget(); - unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; - - if (needsCmpXchgNb(AI->getType())) - return true; - - if (AI->getType()->getPrimitiveSizeInBits() > NativeWidth) - return false; - - AtomicRMWInst::BinOp Op = AI->getOperation(); - switch (Op) { - default: - llvm_unreachable("Unknown atomic operation"); - case AtomicRMWInst::Xchg: - case AtomicRMWInst::Add: - case AtomicRMWInst::Sub: - // It's better to use xadd, xsub or xchg for these in all cases. - return false; - case AtomicRMWInst::Or: - case AtomicRMWInst::And: - case AtomicRMWInst::Xor: - // If the atomicrmw's result isn't actually used, we can just add a "lock" - // prefix to a normal instruction for these operations. - return !AI->use_empty(); - case AtomicRMWInst::Nand: - case AtomicRMWInst::Max: - case AtomicRMWInst::Min: - case AtomicRMWInst::UMax: - case AtomicRMWInst::UMin: - // These always require a non-trivial set of data operations on x86. We must - // use a cmpxchg loop. - return true; - } -} - -bool X86AtomicExpandPass::shouldExpandStore(StoreInst *SI) { - if (needsCmpXchgNb(SI->getValueOperand()->getType())) - return true; - - return false; -} - -bool X86AtomicExpandPass::shouldExpand(Instruction *Inst) { - if (AtomicRMWInst *AI = dyn_cast(Inst)) - return shouldExpandAtomicRMW(AI); - if (StoreInst *SI = dyn_cast(Inst)) - return shouldExpandStore(SI); - return false; -} - -/// Emit IR to implement the given atomicrmw operation on values in registers, -/// returning the new value. 
-static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder, - Value *Loaded, Value *Inc) { - Value *NewVal; - switch (Op) { - case AtomicRMWInst::Xchg: - return Inc; - case AtomicRMWInst::Add: - return Builder.CreateAdd(Loaded, Inc, "new"); - case AtomicRMWInst::Sub: - return Builder.CreateSub(Loaded, Inc, "new"); - case AtomicRMWInst::And: - return Builder.CreateAnd(Loaded, Inc, "new"); - case AtomicRMWInst::Nand: - return Builder.CreateNot(Builder.CreateAnd(Loaded, Inc), "new"); - case AtomicRMWInst::Or: - return Builder.CreateOr(Loaded, Inc, "new"); - case AtomicRMWInst::Xor: - return Builder.CreateXor(Loaded, Inc, "new"); - case AtomicRMWInst::Max: - NewVal = Builder.CreateICmpSGT(Loaded, Inc); - return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); - case AtomicRMWInst::Min: - NewVal = Builder.CreateICmpSLE(Loaded, Inc); - return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); - case AtomicRMWInst::UMax: - NewVal = Builder.CreateICmpUGT(Loaded, Inc); - return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); - case AtomicRMWInst::UMin: - NewVal = Builder.CreateICmpULE(Loaded, Inc); - return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); - default: - break; - } - llvm_unreachable("Unknown atomic op"); -} - -bool X86AtomicExpandPass::expandAtomicRMW(AtomicRMWInst *AI) { - AtomicOrdering Order = - AI->getOrdering() == Unordered ? Monotonic : AI->getOrdering(); - Value *Addr = AI->getPointerOperand(); - BasicBlock *BB = AI->getParent(); - Function *F = BB->getParent(); - LLVMContext &Ctx = F->getContext(); - - // Given: atomicrmw some_op iN* %addr, iN %incr ordering - // - // The standard expansion we produce is: - // [...] - // %init_loaded = load atomic iN* %addr - // br label %loop - // loop: - // %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ] - // %new = some_op iN %loaded, %incr - // %pair = cmpxchg iN* %addr, iN %loaded, iN %new - // %new_loaded = extractvalue { iN, i1 } %pair, 0 - // %success = extractvalue { iN, i1 } %pair, 1 - // br i1 %success, label %atomicrmw.end, label %loop - // atomicrmw.end: - // [...] - BasicBlock *ExitBB = BB->splitBasicBlock(AI, "atomicrmw.end"); - BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB); - - // This grabs the DebugLoc from AI. - IRBuilder<> Builder(AI); - - // The split call above "helpfully" added a branch at the end of BB (to the - // wrong place), but we want a load. It's easiest to just remove - // the branch entirely. - std::prev(BB->end())->eraseFromParent(); - Builder.SetInsertPoint(BB); - LoadInst *InitLoaded = Builder.CreateLoad(Addr); - InitLoaded->setAlignment(AI->getType()->getPrimitiveSizeInBits()); - Builder.CreateBr(LoopBB); - - // Start the main loop block now that we've taken care of the preliminaries. 
- Builder.SetInsertPoint(LoopBB); - PHINode *Loaded = Builder.CreatePHI(AI->getType(), 2, "loaded"); - Loaded->addIncoming(InitLoaded, BB); - - Value *NewVal = - performAtomicOp(AI->getOperation(), Builder, Loaded, AI->getValOperand()); - - Value *Pair = Builder.CreateAtomicCmpXchg( - Addr, Loaded, NewVal, Order, - AtomicCmpXchgInst::getStrongestFailureOrdering(Order)); - Value *NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded"); - Loaded->addIncoming(NewLoaded, LoopBB); - - Value *Success = Builder.CreateExtractValue(Pair, 1, "success"); - Builder.CreateCondBr(Success, ExitBB, LoopBB); - - AI->replaceAllUsesWith(NewLoaded); - - return true; -} - -bool X86AtomicExpandPass::expandAtomicStore(StoreInst *SI) { - // An atomic store might need cmpxchg16b (or 8b on x86) to execute. Express - // this in terms of the usual expansion to "atomicrmw xchg". - IRBuilder<> Builder(SI); - AtomicOrdering Order = - SI->getOrdering() == Unordered ? Monotonic : SI->getOrdering(); - AtomicRMWInst *AI = - Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, SI->getPointerOperand(), - SI->getValueOperand(), Order); - - // Now we have an appropriate swap instruction, lower it as usual. - if (shouldExpandAtomicRMW(AI)) { - expandAtomicRMW(AI); - AI->eraseFromParent(); - return true; - } - - return AI; -} Index: lib/Target/X86/X86ISelLowering.h =================================================================== --- lib/Target/X86/X86ISelLowering.h +++ lib/Target/X86/X86ISelLowering.h @@ -960,6 +960,12 @@ const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override; + bool shouldExpandAtomicLoadInIR(LoadInst *SI) const override; + bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; + bool shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; + + bool needsCmpXchgNb(const Type *MemType) const; + /// Utility function to emit atomic-load-arith operations (and, or, xor, /// nand, max, min, umax, umin). It takes the corresponding instruction to /// expand, the associated machine basic block, and the associated X86 Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -16612,6 +16612,68 @@ } } +/// Returns true if the operand type is 1 step up from the native width, and +/// the corresponding cmpxchg8b or cmpxchg16b instruction is available. +/// Used to know whether to use cmpxchg8/16b when expanding atomic operations +/// (otherwise we leave them alone to become __sync_fetch_and_... calls). +bool X86TargetLowering::needsCmpXchgNb(const Type *MemType) const { + const X86Subtarget &Subtarget = + getTargetMachine().getSubtarget(); + unsigned OpWidth = MemType->getPrimitiveSizeInBits(); + + if (OpWidth == 64) + return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b + else if (OpWidth == 128) + return Subtarget.hasCmpxchg16b(); + else + return false; +} + +bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { + return needsCmpXchgNb(SI->getValueOperand()->getType()); +} + +bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *SI) const { + return false; // FIXME, currently these are expanded separately in this file. +} + +bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { + const X86Subtarget &Subtarget = + getTargetMachine().getSubtarget(); + unsigned NativeWidth = Subtarget.is64Bit() ? 
64 : 32; + const Type *MemType = AI->getType(); + + // If the operand is too big, we must see if cmpxchg8/16b is available + // and default to library calls otherwise. + if (MemType->getPrimitiveSizeInBits() > NativeWidth) + return needsCmpXchgNb(MemType); + + AtomicRMWInst::BinOp Op = AI->getOperation(); + switch (Op) { + default: + llvm_unreachable("Unknown atomic operation"); + case AtomicRMWInst::Xchg: + case AtomicRMWInst::Add: + case AtomicRMWInst::Sub: + // It's better to use xadd, xsub or xchg for these in all cases. + return false; + case AtomicRMWInst::Or: + case AtomicRMWInst::And: + case AtomicRMWInst::Xor: + // If the atomicrmw's result isn't actually used, we can just add a "lock" + // prefix to a normal instruction for these operations. + return !AI->use_empty(); + case AtomicRMWInst::Nand: + case AtomicRMWInst::Max: + case AtomicRMWInst::Min: + case AtomicRMWInst::UMax: + case AtomicRMWInst::UMin: + // These always require a non-trivial set of data operations on x86. We must + // use a cmpxchg loop. + return true; + } +} + static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); @@ -17131,7 +17193,7 @@ case ISD::ATOMIC_LOAD_UMIN: case ISD::ATOMIC_LOAD_UMAX: // Delegate to generic TypeLegalization. Situations we can really handle - // should have already been dealt with by X86AtomicExpandPass.cpp. + // should have already been dealt with by AtomicExpandPass.cpp. break; case ISD::ATOMIC_LOAD: { ReplaceATOMIC_LOAD(N, Results, DAG); Index: lib/Target/X86/X86TargetMachine.cpp =================================================================== --- lib/Target/X86/X86TargetMachine.cpp +++ lib/Target/X86/X86TargetMachine.cpp @@ -105,7 +105,7 @@ } void X86PassConfig::addIRPasses() { - addPass(createX86AtomicExpandPass(&getX86TargetMachine())); + addPass(createAtomicExpandPass(&getX86TargetMachine())); TargetPassConfig::addIRPasses(); } Index: test/Transforms/AtomicExpand/ARM/atomic-expansion-v7.ll =================================================================== --- test/Transforms/AtomicExpand/ARM/atomic-expansion-v7.ll +++ test/Transforms/AtomicExpand/ARM/atomic-expansion-v7.ll @@ -2,7 +2,7 @@ define i8 @test_atomic_xchg_i8(i8* %ptr, i8 %xchgend) { ; CHECK-LABEL: @test_atomic_xchg_i8 -; CHECK-NOT: fence +; CHECK-NOT: dmb ; CHECK: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i8(i8* %ptr) @@ -12,7 +12,7 @@ ; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0 ; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]] ; CHECK: [[END]]: -; CHECK-NOT: fence +; CHECK-NOT: dmb ; CHECK: ret i8 [[OLDVAL]] %res = atomicrmw xchg i8* %ptr, i8 %xchgend monotonic ret i8 %res @@ -20,7 +20,7 @@ define i16 @test_atomic_add_i16(i16* %ptr, i16 %addend) { ; CHECK-LABEL: @test_atomic_add_i16 -; CHECK: fence release +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i16(i16* %ptr) @@ -31,7 +31,7 @@ ; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0 ; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]] ; CHECK: [[END]]: -; CHECK: fence seq_cst +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: ret i16 [[OLDVAL]] %res = atomicrmw add i16* %ptr, i16 %addend seq_cst ret i16 %res @@ -39,7 +39,7 @@ define i32 @test_atomic_sub_i32(i32* %ptr, i32 %subend) { ; CHECK-LABEL: @test_atomic_sub_i32 -; CHECK-NOT: fence +; CHECK-NOT: dmb ; CHECK: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK: [[OLDVAL:%.*]] = call 
i32 @llvm.arm.ldrex.p0i32(i32* %ptr) @@ -48,7 +48,7 @@ ; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0 ; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]] ; CHECK: [[END]]: -; CHECK: fence acquire +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: ret i32 [[OLDVAL]] %res = atomicrmw sub i32* %ptr, i32 %subend acquire ret i32 %res @@ -56,7 +56,7 @@ define i8 @test_atomic_and_i8(i8* %ptr, i8 %andend) { ; CHECK-LABEL: @test_atomic_and_i8 -; CHECK: fence release +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i8(i8* %ptr) @@ -67,7 +67,7 @@ ; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0 ; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]] ; CHECK: [[END]]: -; CHECK-NOT: fence +; CHECK-NOT: dmb ; CHECK: ret i8 [[OLDVAL]] %res = atomicrmw and i8* %ptr, i8 %andend release ret i8 %res @@ -75,7 +75,7 @@ define i16 @test_atomic_nand_i16(i16* %ptr, i16 %nandend) { ; CHECK-LABEL: @test_atomic_nand_i16 -; CHECK: fence release +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i16(i16* %ptr) @@ -87,7 +87,7 @@ ; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0 ; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]] ; CHECK: [[END]]: -; CHECK: fence seq_cst +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: ret i16 [[OLDVAL]] %res = atomicrmw nand i16* %ptr, i16 %nandend seq_cst ret i16 %res @@ -95,7 +95,7 @@ define i64 @test_atomic_or_i64(i64* %ptr, i64 %orend) { ; CHECK-LABEL: @test_atomic_or_i64 -; CHECK: fence release +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK: [[PTR8:%.*]] = bitcast i64* %ptr to i8* @@ -115,7 +115,7 @@ ; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0 ; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]] ; CHECK: [[END]]: -; CHECK: fence seq_cst +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: ret i64 [[OLDVAL]] %res = atomicrmw or i64* %ptr, i64 %orend seq_cst ret i64 %res @@ -123,7 +123,7 @@ define i8 @test_atomic_xor_i8(i8* %ptr, i8 %xorend) { ; CHECK-LABEL: @test_atomic_xor_i8 -; CHECK: fence release +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i8(i8* %ptr) @@ -134,7 +134,7 @@ ; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0 ; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]] ; CHECK: [[END]]: -; CHECK: fence seq_cst +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: ret i8 [[OLDVAL]] %res = atomicrmw xor i8* %ptr, i8 %xorend seq_cst ret i8 %res @@ -142,7 +142,7 @@ define i8 @test_atomic_max_i8(i8* %ptr, i8 %maxend) { ; CHECK-LABEL: @test_atomic_max_i8 -; CHECK: fence release +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i8(i8* %ptr) @@ -154,7 +154,7 @@ ; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0 ; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]] ; CHECK: [[END]]: -; CHECK: fence seq_cst +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: ret i8 [[OLDVAL]] %res = atomicrmw max i8* %ptr, i8 %maxend seq_cst ret i8 %res @@ -162,7 +162,7 @@ define i8 @test_atomic_min_i8(i8* %ptr, i8 %minend) { ; CHECK-LABEL: @test_atomic_min_i8 -; CHECK: fence release +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK: [[OLDVAL32:%.*]] = call i32 
@llvm.arm.ldrex.p0i8(i8* %ptr) @@ -174,7 +174,7 @@ ; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0 ; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]] ; CHECK: [[END]]: -; CHECK: fence seq_cst +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: ret i8 [[OLDVAL]] %res = atomicrmw min i8* %ptr, i8 %minend seq_cst ret i8 %res @@ -182,7 +182,7 @@ define i8 @test_atomic_umax_i8(i8* %ptr, i8 %umaxend) { ; CHECK-LABEL: @test_atomic_umax_i8 -; CHECK: fence release +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i8(i8* %ptr) @@ -194,7 +194,7 @@ ; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0 ; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]] ; CHECK: [[END]]: -; CHECK: fence seq_cst +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: ret i8 [[OLDVAL]] %res = atomicrmw umax i8* %ptr, i8 %umaxend seq_cst ret i8 %res @@ -202,7 +202,7 @@ define i8 @test_atomic_umin_i8(i8* %ptr, i8 %uminend) { ; CHECK-LABEL: @test_atomic_umin_i8 -; CHECK: fence release +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i8(i8* %ptr) @@ -214,7 +214,7 @@ ; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0 ; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]] ; CHECK: [[END]]: -; CHECK: fence seq_cst +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: ret i8 [[OLDVAL]] %res = atomicrmw umin i8* %ptr, i8 %uminend seq_cst ret i8 %res @@ -222,7 +222,7 @@ define i8 @test_cmpxchg_i8_seqcst_seqcst(i8* %ptr, i8 %desired, i8 %newval) { ; CHECK-LABEL: @test_cmpxchg_i8_seqcst_seqcst -; CHECK: fence release +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: @@ -238,11 +238,11 @@ ; CHECK: br i1 [[TST]], label %[[SUCCESS_BB:.*]], label %[[LOOP]] ; CHECK: [[SUCCESS_BB]]: -; CHECK: fence seq_cst +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: br label %[[DONE:.*]] ; CHECK: [[FAILURE_BB]]: -; CHECK: fence seq_cst +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: br label %[[DONE]] ; CHECK: [[DONE]]: @@ -256,7 +256,7 @@ define i16 @test_cmpxchg_i16_seqcst_monotonic(i16* %ptr, i16 %desired, i16 %newval) { ; CHECK-LABEL: @test_cmpxchg_i16_seqcst_monotonic -; CHECK: fence release +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: @@ -272,11 +272,11 @@ ; CHECK: br i1 [[TST]], label %[[SUCCESS_BB:.*]], label %[[LOOP]] ; CHECK: [[SUCCESS_BB]]: -; CHECK: fence seq_cst +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: br label %[[DONE:.*]] ; CHECK: [[FAILURE_BB]]: -; CHECK-NOT: fence +; CHECK-NOT: dmb ; CHECK: br label %[[DONE]] ; CHECK: [[DONE]]: @@ -290,7 +290,7 @@ define i32 @test_cmpxchg_i32_acquire_acquire(i32* %ptr, i32 %desired, i32 %newval) { ; CHECK-LABEL: @test_cmpxchg_i32_acquire_acquire -; CHECK-NOT: fence +; CHECK-NOT: dmb ; CHECK: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: @@ -304,11 +304,11 @@ ; CHECK: br i1 [[TST]], label %[[SUCCESS_BB:.*]], label %[[LOOP]] ; CHECK: [[SUCCESS_BB]]: -; CHECK: fence acquire +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: br label %[[DONE:.*]] ; CHECK: [[FAILURE_BB]]: -; CHECK: fence acquire +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: br label %[[DONE]] ; CHECK: [[DONE]]: @@ -322,7 +322,7 @@ define i64 @test_cmpxchg_i64_monotonic_monotonic(i64* %ptr, i64 %desired, i64 %newval) { ; CHECK-LABEL: @test_cmpxchg_i64_monotonic_monotonic -; CHECK-NOT: fence +; CHECK-NOT: dmb ; CHECK: br label 
%[[LOOP:.*]] ; CHECK: [[LOOP]]: @@ -347,11 +347,11 @@ ; CHECK: br i1 [[TST]], label %[[SUCCESS_BB:.*]], label %[[LOOP]] ; CHECK: [[SUCCESS_BB]]: -; CHECK-NOT: fence +; CHECK-NOT: dmb ; CHECK: br label %[[DONE:.*]] ; CHECK: [[FAILURE_BB]]: -; CHECK-NOT: fence +; CHECK-NOT: dmb ; CHECK: br label %[[DONE]] ; CHECK: [[DONE]]: Index: test/Transforms/AtomicExpand/ARM/cmpxchg-weak.ll =================================================================== --- test/Transforms/AtomicExpand/ARM/cmpxchg-weak.ll +++ test/Transforms/AtomicExpand/ARM/cmpxchg-weak.ll @@ -2,7 +2,8 @@ define i32 @test_cmpxchg_seq_cst(i32* %addr, i32 %desired, i32 %new) { ; CHECK-LABEL: @test_cmpxchg_seq_cst -; CHECK: fence release +; Intrinsic for "dmb ishst" is then expected +; CHECK: call void @llvm.arm.dmb(i32 10) ; CHECK: br label %[[START:.*]] ; CHECK: [[START]]: @@ -16,11 +17,11 @@ ; CHECK: br i1 [[SUCCESS]], label %[[SUCCESS_BB:.*]], label %[[FAILURE_BB]] ; CHECK: [[SUCCESS_BB]]: -; CHECK: fence seq_cst +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: br label %[[END:.*]] ; CHECK: [[FAILURE_BB]]: -; CHECK: fence seq_cst +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: br label %[[END]] ; CHECK: [[END]]: @@ -34,7 +35,7 @@ define i1 @test_cmpxchg_weak_fail(i32* %addr, i32 %desired, i32 %new) { ; CHECK-LABEL: @test_cmpxchg_weak_fail -; CHECK: fence release +; CHECK: call void @llvm.arm.dmb(i32 10) ; CHECK: br label %[[START:.*]] ; CHECK: [[START]]: @@ -48,11 +49,11 @@ ; CHECK: br i1 [[SUCCESS]], label %[[SUCCESS_BB:.*]], label %[[FAILURE_BB:.*]] ; CHECK: [[SUCCESS_BB]]: -; CHECK: fence seq_cst +; CHECK: call void @llvm.arm.dmb(i32 11) ; CHECK: br label %[[END:.*]] ; CHECK: [[FAILURE_BB]]: -; CHECK-NOT: fence +; CHECK-NOT: dmb ; CHECK: br label %[[END]] ; CHECK: [[END]]: @@ -66,7 +67,7 @@ define i32 @test_cmpxchg_monotonic(i32* %addr, i32 %desired, i32 %new) { ; CHECK-LABEL: @test_cmpxchg_monotonic -; CHECK-NOT: fence +; CHECK-NOT: dmb ; CHECK: br label %[[START:.*]] ; CHECK: [[START]]: @@ -80,11 +81,11 @@ ; CHECK: br i1 [[SUCCESS]], label %[[SUCCESS_BB:.*]], label %[[FAILURE_BB:.*]] ; CHECK: [[SUCCESS_BB]]: -; CHECK-NOT: fence +; CHECK-NOT: dmb ; CHECK: br label %[[END:.*]] ; CHECK: [[FAILURE_BB]]: -; CHECK-NOT: fence +; CHECK-NOT: dmb ; CHECK: br label %[[END]] ; CHECK: [[END]]:
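Closing note on the X86 heuristic that moved into X86ISelLowering.cpp: operations with a direct x86 form (xchg/xadd, or a lock-prefixed add/sub/and/or/xor when the old value is unused) are left for ISel, everything else is expanded to a cmpxchg loop. A user-level sketch of the three cases; function names are hypothetical and the comments describe the typical x86-64 lowering:

    #include <atomic>

    std::atomic<long> Counter;

    void bump()          { Counter.fetch_add(1); }          // result unused: lock add
    long bumpAndGet()    { return Counter.fetch_add(1); }   // result used: lock xadd
    long setBitAndGet()  { return Counter.fetch_or(1); }    // or cannot return the old value: cmpxchg loop
    long takeMax(long V) {                                  // no atomic max instruction: cmpxchg loop
      long Cur = Counter.load(std::memory_order_relaxed);
      while (Cur < V && !Counter.compare_exchange_weak(Cur, V)) {
      }
      return Cur;
    }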