diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -716,10 +716,6 @@
     /// ARMPCLabelIndex - Keep track of the number of ARM PC labels created.
     unsigned ARMPCLabelIndex;
 
-    // TODO: remove this, and have shouldInsertFencesForAtomic do the proper
-    // check.
-    bool InsertFencesForAtomic;
-
     bool HasStandaloneRem = true;
 
     void addTypeForNEON(MVT VT, MVT PromotedLdStVT, MVT PromotedBitwiseVT);
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -153,6 +153,17 @@
   ARM::R0, ARM::R1, ARM::R2, ARM::R3
 };
 
+// Prior to ARMv6, there were no LL/SC instructions available, which in general
+// means that lock-free atomics can't be supported. However, some OSes provide
+// kernel assistance for implementing a cmpxchg operation (e.g. via a
+// "Restartable Atomic Sequence" on FreeBSD, or the kuser_cmpxchg function on
+// Linux). On such OSes, we can assume that functioning lock-free __sync_*
+// atomic libcalls are available, regardless of the target CPU.
+static bool hasLockFreeCmpXChgHelpers(const ARMSubtarget &Subtarget) {
+  return (Subtarget.isTargetDarwin() || Subtarget.isTargetLinux() ||
+          Subtarget.isTargetFreeBSD() || Subtarget.isTargetNetBSD());
+}
+
 void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
                                        MVT PromotedBitwiseVT) {
   if (VT != PromotedLdStVT) {
@@ -1279,55 +1290,47 @@
   else
     setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
 
-  // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
-  // the default expansion.
-  InsertFencesForAtomic = false;
-  if (Subtarget->hasAnyDataBarrier() &&
-      (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
-    // ATOMIC_FENCE needs custom lowering; the others should have been expanded
-    // to ldrex/strex loops already.
-    setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
-    if (!Subtarget->isThumb() || !Subtarget->isMClass())
+  if (Subtarget->hasLdrex()) {
+    // If we have ldrex, we can support native 32-bit atomics; with ldrexd,
+    // also 64-bit atomics.
+    if (Subtarget->hasLdrexd()) {
+      setMaxAtomicSizeInBitsSupported(64);
       setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
-
-    // On v8, we have particularly efficient implementations of atomic fences
-    // if they can be combined with nearby atomic loads and stores.
-    if (!Subtarget->hasAcquireRelease() ||
-        getTargetMachine().getOptLevel() == 0) {
-      // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
-      InsertFencesForAtomic = true;
-    }
-  } else {
-    // If there's anything we can use as a barrier, go through custom lowering
-    // for ATOMIC_FENCE.
-    // If target has DMB in thumb, Fences can be inserted.
-    if (Subtarget->hasDataBarrier())
-      InsertFencesForAtomic = true;
-
-    setOperationAction(ISD::ATOMIC_FENCE, MVT::Other,
-                       Subtarget->hasAnyDataBarrier() ? Custom : Expand);
-
-    // Set them all for expansion, which will force libcalls.
-    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand);
-    setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand);
-    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand);
-    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand);
-    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Expand);
-    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Expand);
-    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Expand);
+    } else
+      setMaxAtomicSizeInBitsSupported(32);
+  } else if (hasLockFreeCmpXChgHelpers(*Subtarget)) {
+    // If we're on an OS which provides kernel assistance (exposed via
+    // lock-free __sync_* libcalls), we can emit 32-bit atomic loads/stores
+    // directly, and depend on the __sync_* calls for the other operations.
+    // We don't support 64-bit in this way, because the 64-bit load/store
+    // instructions are not (always) atomic.
+
+    setMaxAtomicSizeInBitsSupported(32);
+
+    // Set everything but ATOMIC_LOAD/ATOMIC_STORE for expansion to __sync_*
+    // libcalls.
+    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Expand);
     setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand);
     setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand);
     setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand);
     setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand);
     setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand);
-    // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
-    // Unordered/Monotonic case.
-    if (!InsertFencesForAtomic) {
-      setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
-      setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
-    }
+  } else {
+    // Otherwise, native atomic support cannot be guaranteed.
+    setMaxAtomicSizeInBitsSupported(0);
   }
 
+  // If there's anything we can use as a barrier, go through custom lowering
+  // for ATOMIC_FENCE. Otherwise expand to __sync_synchronize libcall.
+  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other,
+                     Subtarget->hasAnyDataBarrier() ? Custom : Expand);
+
   setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
 
   // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
@@ -9711,16 +9714,6 @@
   return LowerVecReduce(Op, DAG, ST);
 }
 
-static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
-  if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering()))
-    // Acquire/Release load/store is not legal for targets without a dmb or
-    // equivalent available.
-    return SDValue();
-
-  // Monotonic load/store is legal for all targets.
-  return Op;
-}
-
 static void ReplaceREADCYCLECOUNTER(SDNode *N,
                                     SmallVectorImpl<SDValue> &Results,
                                     SelectionDAG &DAG,
@@ -9927,8 +9920,6 @@
   case ISD::VECREDUCE_FMIN:
   case ISD::VECREDUCE_FMAX:
     return LowerVecReduceF(Op, DAG, Subtarget);
-  case ISD::ATOMIC_LOAD:
-  case ISD::ATOMIC_STORE:  return LowerAtomicLoadStore(Op, DAG);
   case ISD::FSINCOS:       return LowerFSINCOS(Op, DAG);
   case ISD::SDIVREM:
   case ISD::UDIVREM:       return LowerDivRem(Op, DAG);
@@ -18841,8 +18832,6 @@
   // First, if the target has no DMB, see what fallback we can use.
   if (!Subtarget->hasDataBarrier()) {
     // Some ARMv6 cpus can support data barriers with an mcr instruction.
-    // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
-    // here.
     if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
       Function *MCR = Intrinsic::getDeclaration(M, Intrinsic::arm_mcr);
       Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
@@ -18850,9 +18839,10 @@
                         Builder.getInt32(10), Builder.getInt32(5)};
       return Builder.CreateCall(MCR, args);
     } else {
-      // Instead of using barriers, atomic accesses on these subtargets use
-      // libcalls.
-      llvm_unreachable("makeDMB on a target so old that it has no barriers");
+      // Instead of using barriers, atomic accesses in Thumb1 and pre-v6 ARM
+      // mode just use a libcall to __sync_synchronize, so simply emit a
+      // fence instruction.
+      return Builder.CreateFence(AtomicOrdering::SequentiallyConsistent);
     }
   } else {
     Function *DMB = Intrinsic::getDeclaration(M, Intrinsic::arm_dmb);
@@ -18907,64 +18897,93 @@
   llvm_unreachable("Unknown fence ordering in emitTrailingFence");
 }
 
-// Loads and stores less than 64-bits are already atomic; ones above that
-// are doomed anyway, so defer to the default libcall and blame the OS when
-// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
-// anything for those.
+// Loads and stores less than 64 bits are intrinsically atomic. For 64-bit
+// operations, we can use ldrexd/strexd. We don't need to check for their
+// availability, because when they're not available, we only support 32-bit
+// lock-free atomics.
+//
+// FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
+// guarantee, see DDI0406C ARM architecture reference manual, sections
+// A8.8.72-74 LDRD); on such CPUs it would be advantageous to not expand 64-bit
+// loads and stores to LL/SC sequences.
 bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
   unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
-  return (Size == 64) && !Subtarget->isMClass();
+  return Size == 64;
 }
 
-// Loads and stores less than 64-bits are already atomic; ones above that
-// are doomed anyway, so defer to the default libcall and blame the OS when
-// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
-// anything for those.
-// FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
-// guarantee, see DDI0406C ARM architecture reference manual,
-// sections A8.8.72-74 LDRD)
 TargetLowering::AtomicExpansionKind
 ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
   unsigned Size = LI->getType()->getPrimitiveSizeInBits();
-  return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly
-                                                  : AtomicExpansionKind::None;
+  if (Size == 64)
+    return AtomicExpansionKind::LLOnly;
+
+  return AtomicExpansionKind::None;
 }
 
-// For the real atomic operations, we have ldrex/strex up to 32 bits,
-// and up to 64 bits on the non-M profiles
+// In the following "should*Atomic*" routines, there are two cases to consider:
+// 1. We have native atomics (hasLdrex() is true). We want to expand to LL/SC.
+//
+// 2. We don't actually have native atomics, but we pretend that we do, because
+//    we're on an OS that provides a "magic" lock-free compare-and-swap
+//    routine. In this case, we rely on __sync libcall expansions for all the
+//    operations. Thus, we avoid doing expansions in IR.
+//
+// If there are neither native atomics nor special OS routines allowing
+// lock-free libcalls, these routines will not be called at all, because
+// MaxAtomicSizeInBitsSupported was set to 0.
+
 TargetLowering::AtomicExpansionKind
 ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
+  // Floating-point operations are always expanded to a cmpxchg loop, because
+  // they may trigger a trap which aborts an LLSC sequence.
   if (AI->isFloatingPointOperation())
     return AtomicExpansionKind::CmpXChg;
 
-  unsigned Size = AI->getType()->getPrimitiveSizeInBits();
-  bool hasAtomicRMW = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
-  return (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW)
-             ? AtomicExpansionKind::LLSC
-             : AtomicExpansionKind::None;
+  if (!Subtarget->hasLdrex())
+    return AtomicExpansionKind::None;
+  return AtomicExpansionKind::LLSC;
 }
 
-// Similar to shouldExpandAtomicRMWInIR, ldrex/strex can be used up to 32
-// bits, and up to 64 bits on the non-M profiles.
+// Similar to shouldExpandAtomicRMWInIR, we use LL/SC when available, or a
+// __sync_* function if not.
 TargetLowering::AtomicExpansionKind
 ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const {
+  if (!Subtarget->hasLdrex())
+    return AtomicExpansionKind::None;
   // At -O0, fast-regalloc cannot cope with the live vregs necessary to
   // implement cmpxchg without spilling. If the address being exchanged is also
   // on the stack and close enough to the spill slot, this can lead to a
   // situation where the monitor always gets cleared and the atomic operation
   // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
-  unsigned Size = AI->getOperand(1)->getType()->getPrimitiveSizeInBits();
-  bool HasAtomicCmpXchg =
-      !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
-  if (getTargetMachine().getOptLevel() != 0 && HasAtomicCmpXchg &&
-      Size <= (Subtarget->isMClass() ? 32U : 64U))
-    return AtomicExpansionKind::LLSC;
-  return AtomicExpansionKind::None;
+  if (getTargetMachine().getOptLevel() == 0)
+    return AtomicExpansionKind::None;
+
+  return AtomicExpansionKind::LLSC;
 }
 
 bool ARMTargetLowering::shouldInsertFencesForAtomic(
     const Instruction *I) const {
-  return InsertFencesForAtomic;
+  // When we don't have ldrex, we may be emitting __sync_* libcalls. These don't
+  // need fences inserted as they already have appropriate barriers within the
+  // function. Loads and stores, however, are handled directly, and thus do
+  // require fence insertion.
+  if (!Subtarget->hasLdrex()) {
+    return isa<LoadInst>(I) || isa<StoreInst>(I);
+  }
+
+  // In -O0 mode, there's a hack in place to expand ATOMIC_CMP_SWAP in a late
+  // pseudo expansion instead of in IR. This pseudo requires fences to be
+  // emitted externally.
+  if (getTargetMachine().getOptLevel() == 0 && isa<AtomicCmpXchgInst>(I))
+    return true;
+
+  // On v8, we have additional acquire/release instructions that are more
+  // efficient than a separate fence.
+  if (Subtarget->hasAcquireRelease())
+    return false;
+
+  // Otherwise, insert fences (dmb ish) around all atomic operations.
+  return true;
 }
 
 // This has so far only been implemented for MachO.
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h --- a/llvm/lib/Target/ARM/ARMSubtarget.h +++ b/llvm/lib/Target/ARM/ARMSubtarget.h @@ -739,6 +739,7 @@ bool isTargetLinux() const { return TargetTriple.isOSLinux(); } bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); } bool isTargetNetBSD() const { return TargetTriple.isOSNetBSD(); } + bool isTargetFreeBSD() const { return TargetTriple.isOSFreeBSD(); } bool isTargetWindows() const { return TargetTriple.isOSWindows(); } bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); } @@ -857,8 +858,16 @@ /// scheduling, DAGCombine, etc.). bool useAA() const override { return true; } - // enableAtomicExpand- True if we need to expand our atomics. - bool enableAtomicExpand() const override; + // True for targets that support atomic ldrex/strex instructions. + bool hasLdrex() const { + return HasV6Ops && (!InThumbMode || HasV8MBaselineOps); + } + + // True for targets which support atomic ldrexd/strexd instructions. + bool hasLdrexd() const { + // The Cortex-M series only support 32bit atomics. + return hasLdrex() && !isMClass(); + } /// getInstrItins - Return the instruction itineraries based on subtarget /// selection. diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp --- a/llvm/lib/Target/ARM/ARMSubtarget.cpp +++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp @@ -409,8 +409,6 @@ return !isThumb1Only(); } -bool ARMSubtarget::enableAtomicExpand() const { return hasAnyDataBarrier(); } - bool ARMSubtarget::useStride4VFPs() const { // For general targets, the prologue can grow when VFPs are allocated with // stride 4 (more vpush instructions). But WatchOS uses a compact unwind diff --git a/llvm/test/CodeGen/ARM/atomic-64bit.ll b/llvm/test/CodeGen/ARM/atomic-64bit.ll --- a/llvm/test/CodeGen/ARM/atomic-64bit.ll +++ b/llvm/test/CodeGen/ARM/atomic-64bit.ll @@ -1,12 +1,12 @@ -; RUN: llc < %s -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE -; RUN: llc < %s -mtriple=thumbv7-none-linux-gnueabihf | FileCheck %s --check-prefix=CHECK-THUMB --check-prefix=CHECK-THUMB-LE -; RUN: llc < %s -mtriple=armebv7 -target-abi apcs | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE -; RUN: llc < %s -mtriple=thumbebv7-none-linux-gnueabihf | FileCheck %s --check-prefix=CHECK-THUMB --check-prefix=CHECK-THUMB-BE -; RUN: llc < %s -mtriple=armv7m--none-eabi | FileCheck %s --check-prefix=CHECK-M -; RUN: llc < %s -mtriple=armv8m--none-eabi | FileCheck %s --check-prefix=CHECK-M +; RUN: llc < %s -mtriple=armv7-apple-ios | FileCheck %s --check-prefixes=COMMON,CHECK,CHECK-LE +; RUN: llc < %s -mtriple=thumbv7-none-linux-gnueabihf | FileCheck %s --check-prefixes=COMMON,CHECK-THUMB,CHECK-THUMB-LE +; RUN: llc < %s -mtriple=armebv7 -target-abi apcs | FileCheck %s --check-prefixes=COMMON,CHECK,CHECK-BE +; RUN: llc < %s -mtriple=thumbebv7-none-linux-gnueabihf | FileCheck %s --check-prefixes=COMMON,CHECK-THUMB,CHECK-THUMB-BE +; RUN: llc < %s -mtriple=armv7m--none-eabi | FileCheck %s --check-prefixes=COMMON,CHECK-M +; RUN: llc < %s -mtriple=armv8m.base--none-eabi | FileCheck %s --check-prefixes=COMMON,CHECK-M define i64 @test1(i64* %ptr, i64 %val) { -; CHECK-LABEL: test1: +; COMMON-LABEL: test1: ; CHECK: dmb {{ish$}} ; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]] ; CHECK-LE: adds [[REG3:(r[0-9]?[02468])]], [[REG1]] @@ -18,7 +18,6 @@ ; CHECK: bne ; CHECK: dmb {{ish$}} -; CHECK-THUMB-LABEL: test1: ; CHECK-THUMB: dmb {{ish$}} ; CHECK-THUMB: ldrexd 
[[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]] ; CHECK-THUMB-LE: adds.w [[REG3:[a-z0-9]+]], [[REG1]] @@ -30,14 +29,14 @@ ; CHECK-THUMB: bne ; CHECK-THUMB: dmb {{ish$}} -; CHECK-M: __sync_fetch_and_add_8 +; CHECK-M: __atomic_fetch_add_8 %r = atomicrmw add i64* %ptr, i64 %val seq_cst ret i64 %r } define i64 @test2(i64* %ptr, i64 %val) { -; CHECK-LABEL: test2: +; COMMON-LABEL: test2: ; CHECK: dmb {{ish$}} ; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]] ; CHECK-LE: subs [[REG3:(r[0-9]?[02468])]], [[REG1]] @@ -49,7 +48,6 @@ ; CHECK: bne ; CHECK: dmb {{ish$}} -; CHECK-THUMB-LABEL: test2: ; CHECK-THUMB: dmb {{ish$}} ; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]] ; CHECK-THUMB-LE: subs.w [[REG3:[a-z0-9]+]], [[REG1]] @@ -61,14 +59,14 @@ ; CHECK-THUMB: bne ; CHECK-THUMB: dmb {{ish$}} -; CHECK-M: __sync_fetch_and_sub_8 +; CHECK-M: __atomic_fetch_sub_8 %r = atomicrmw sub i64* %ptr, i64 %val seq_cst ret i64 %r } define i64 @test3(i64* %ptr, i64 %val) { -; CHECK-LABEL: test3: +; COMMON-LABEL: test3: ; CHECK: dmb {{ish$}} ; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]] ; CHECK-LE-DAG: and [[REG3:(r[0-9]?[02468])]], [[REG1]] @@ -80,7 +78,6 @@ ; CHECK: bne ; CHECK: dmb {{ish$}} -; CHECK-THUMB-LABEL: test3: ; CHECK-THUMB: dmb {{ish$}} ; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]] ; CHECK-THUMB-LE-DAG: and.w [[REG3:[a-z0-9]+]], [[REG1]] @@ -92,14 +89,14 @@ ; CHECK-THUMB: bne ; CHECK-THUMB: dmb {{ish$}} -; CHECK-M: __sync_fetch_and_and_8 +; CHECK-M: __atomic_fetch_and_8 %r = atomicrmw and i64* %ptr, i64 %val seq_cst ret i64 %r } define i64 @test4(i64* %ptr, i64 %val) { -; CHECK-LABEL: test4: +; COMMON-LABEL: test4: ; CHECK: dmb {{ish$}} ; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]] ; CHECK-LE-DAG: orr [[REG3:(r[0-9]?[02468])]], [[REG1]] @@ -111,7 +108,6 @@ ; CHECK: bne ; CHECK: dmb {{ish$}} -; CHECK-THUMB-LABEL: test4: ; CHECK-THUMB: dmb {{ish$}} ; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]] ; CHECK-THUMB-LE-DAG: orr.w [[REG3:[a-z0-9]+]], [[REG1]] @@ -123,14 +119,14 @@ ; CHECK-THUMB: bne ; CHECK-THUMB: dmb {{ish$}} -; CHECK-M: __sync_fetch_and_or_8 +; CHECK-M: __atomic_fetch_or_8 %r = atomicrmw or i64* %ptr, i64 %val seq_cst ret i64 %r } define i64 @test5(i64* %ptr, i64 %val) { -; CHECK-LABEL: test5: +; COMMON-LABEL: test5: ; CHECK: dmb {{ish$}} ; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]] ; CHECK-LE-DAG: eor [[REG3:(r[0-9]?[02468])]], [[REG1]] @@ -142,7 +138,6 @@ ; CHECK: bne ; CHECK: dmb {{ish$}} -; CHECK-THUMB-LABEL: test5: ; CHECK-THUMB: dmb {{ish$}} ; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]] ; CHECK-THUMB-LE-DAG: eor.w [[REG3:[a-z0-9]+]], [[REG1]] @@ -154,14 +149,14 @@ ; CHECK-THUMB: bne ; CHECK-THUMB: dmb {{ish$}} -; CHECK-M: __sync_fetch_and_xor_8 +; CHECK-M: __atomic_fetch_xor_8 %r = atomicrmw xor i64* %ptr, i64 %val seq_cst ret i64 %r } define i64 @test6(i64* %ptr, i64 %val) { -; CHECK-LABEL: test6: +; COMMON-LABEL: test6: ; CHECK: dmb {{ish$}} ; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]] ; CHECK: strexd {{[a-z0-9]+}}, {{r[0-9]?[02468]}}, {{r[0-9]?[13579]}} @@ -169,7 +164,6 @@ ; CHECK: bne ; CHECK: dmb {{ish$}} -; CHECK-THUMB-LABEL: test6: ; CHECK-THUMB: dmb {{ish$}} ; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]] ; CHECK-THUMB: strexd {{[a-z0-9]+}}, {{[a-z0-9]+}}, {{[a-z0-9]+}} @@ -177,14 +171,14 @@ ; CHECK-THUMB: bne ; CHECK-THUMB: dmb {{ish$}} -; CHECK-M: __sync_lock_test_and_set_8 +; CHECK-M: __atomic_exchange_8 %r = 
atomicrmw xchg i64* %ptr, i64 %val seq_cst ret i64 %r } define i64 @test7(i64* %ptr, i64 %val1, i64 %val2) { -; CHECK-LABEL: test7: +; COMMON-LABEL: test7: ; CHECK-DAG: mov [[VAL1LO:r[0-9]+]], r1 ; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]] ; CHECK-LE-DAG: eor [[MISMATCH_LO:.*]], [[REG1]], [[VAL1LO]] @@ -199,7 +193,6 @@ ; CHECK: beq ; CHECK: dmb {{ish$}} -; CHECK-THUMB-LABEL: test7: ; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]] ; CHECK-THUMB-LE-DAG: eor.w [[MISMATCH_LO:[a-z0-9]+]], [[REG1]], r2 ; CHECK-THUMB-LE-DAG: eor.w [[MISMATCH_HI:[a-z0-9]+]], [[REG2]], r3 @@ -213,7 +206,7 @@ ; CHECK-THUMB: beq ; CHECK-THUMB: dmb {{ish$}} -; CHECK-M: __sync_val_compare_and_swap_8 +; CHECK-M: __atomic_compare_exchange_8 %pair = cmpxchg i64* %ptr, i64 %val1, i64 %val2 seq_cst seq_cst %r = extractvalue { i64, i1 } %pair, 0 @@ -223,21 +216,20 @@ ; Compiles down to a single ldrexd, except on M class devices where ldrexd ; isn't supported. define i64 @test8(i64* %ptr) { -; CHECK-LABEL: test8: +; COMMON-LABEL: test8: ; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]] ; CHECK-NOT: strexd ; CHECK: clrex ; CHECK-NOT: strexd ; CHECK: dmb {{ish$}} -; CHECK-THUMB-LABEL: test8: ; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]] ; CHECK-THUMB-NOT: strexd ; CHECK-THUMB: clrex ; CHECK-THUMB-NOT: strexd ; CHECK-THUMB: dmb {{ish$}} -; CHECK-M: __sync_val_compare_and_swap_8 +; CHECK-M: __atomic_load_8 %r = load atomic i64, i64* %ptr seq_cst, align 8 ret i64 %r @@ -247,7 +239,7 @@ ; way to write it. Except on M class devices, where ldrexd/strexd aren't ; supported. define void @test9(i64* %ptr, i64 %val) { -; CHECK-LABEL: test9: +; COMMON-LABEL: test9: ; CHECK: dmb {{ish$}} ; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]] ; CHECK: strexd {{[a-z0-9]+}}, {{r[0-9]?[02468]}}, {{r[0-9]?[13579]}} @@ -255,7 +247,6 @@ ; CHECK: bne ; CHECK: dmb {{ish$}} -; CHECK-THUMB-LABEL: test9: ; CHECK-THUMB: dmb {{ish$}} ; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]] ; CHECK-THUMB: strexd {{[a-z0-9]+}}, {{[a-z0-9]+}}, {{[a-z0-9]+}} @@ -263,14 +254,14 @@ ; CHECK-THUMB: bne ; CHECK-THUMB: dmb {{ish$}} -; CHECK-M: __sync_lock_test_and_set_8 +; CHECK-M: __atomic_store_8 store atomic i64 %val, i64* %ptr seq_cst, align 8 ret void } define i64 @test10(i64* %ptr, i64 %val) { -; CHECK-LABEL: test10: +; COMMON-LABEL: test10: ; CHECK: dmb {{ish$}} ; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]] ; CHECK: mov [[OUT_HI:[a-z0-9]+]], r2 @@ -289,7 +280,6 @@ ; CHECK: bne ; CHECK: dmb {{ish$}} -; CHECK-THUMB-LABEL: test10: ; CHECK-THUMB: dmb {{ish$}} ; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]] ; CHECK-THUMB: mov [[OUT_LO:[a-z0-9]+]], r2 @@ -308,14 +298,14 @@ ; CHECK-THUMB: bne ; CHECK-THUMB: dmb {{ish$}} -; CHECK-M: __sync_fetch_and_min_8 +; CHECK-M: __atomic_compare_exchange_8 %r = atomicrmw min i64* %ptr, i64 %val seq_cst ret i64 %r } define i64 @test11(i64* %ptr, i64 %val) { -; CHECK-LABEL: test11: +; COMMON-LABEL: test11: ; CHECK: dmb {{ish$}} ; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]] ; CHECK: mov [[OUT_HI:[a-z0-9]+]], r2 @@ -334,7 +324,6 @@ ; CHECK: bne ; CHECK: dmb {{ish$}} -; CHECK-THUMB-LABEL: test11: ; CHECK-THUMB: dmb {{ish$}} ; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]] ; CHECK-THUMB: mov [[OUT_LO:[a-z0-9]+]], r2 @@ -353,14 +342,14 @@ ; CHECK-THUMB: bne ; CHECK-THUMB: dmb {{ish$}} -; CHECK-M: __sync_fetch_and_umin_8 +; CHECK-M: __atomic_compare_exchange_8 %r = atomicrmw 
umin i64* %ptr, i64 %val seq_cst ret i64 %r } define i64 @test12(i64* %ptr, i64 %val) { -; CHECK-LABEL: test12: +; COMMON-LABEL: test12: ; CHECK: dmb {{ish$}} ; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]] ; CHECK: mov [[OUT_HI:[a-z0-9]+]], r2 @@ -379,7 +368,6 @@ ; CHECK: bne ; CHECK: dmb {{ish$}} -; CHECK-THUMB-LABEL: test12: ; CHECK-THUMB: dmb {{ish$}} ; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]] ; CHECK-THUMB: mov [[OUT_LO:[a-z0-9]+]], r2 @@ -398,14 +386,14 @@ ; CHECK-THUMB: bne ; CHECK-THUMB: dmb {{ish$}} -; CHECK-M: __sync_fetch_and_max_8 +; CHECK-M: __atomic_compare_exchange_8 %r = atomicrmw max i64* %ptr, i64 %val seq_cst ret i64 %r } define i64 @test13(i64* %ptr, i64 %val) { -; CHECK-LABEL: test13: +; COMMON-LABEL: test13: ; CHECK: dmb {{ish$}} ; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]] ; CHECK: mov [[OUT_HI:[a-z0-9]+]], r2 @@ -424,7 +412,6 @@ ; CHECK: bne ; CHECK: dmb {{ish$}} -; CHECK-THUMB-LABEL: test13: ; CHECK-THUMB: dmb {{ish$}} ; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]] ; CHECK-THUMB: mov [[OUT_LO:[a-z0-9]+]], r2 @@ -443,7 +430,7 @@ ; CHECK-THUMB: bne ; CHECK-THUMB: dmb {{ish$}} -; CHECK-M: __sync_fetch_and_umax_8 +; CHECK-M: __atomic_compare_exchange_8 %r = atomicrmw umax i64* %ptr, i64 %val seq_cst ret i64 %r diff --git a/llvm/test/CodeGen/ARM/atomic-load-store.ll b/llvm/test/CodeGen/ARM/atomic-load-store.ll --- a/llvm/test/CodeGen/ARM/atomic-load-store.ll +++ b/llvm/test/CodeGen/ARM/atomic-load-store.ll @@ -1,27 +1,25 @@ -; RUN: llc < %s -mtriple=armv7-apple-ios -verify-machineinstrs | FileCheck %s -check-prefix=ARM -; RUN: llc < %s -mtriple=armv7-apple-ios -O0 | FileCheck %s -check-prefix=ARM -; RUN: llc < %s -mtriple=thumbv7-apple-ios -verify-machineinstrs | FileCheck %s -check-prefix=THUMBTWO -; RUN: llc < %s -mtriple=thumbv6-apple-ios | FileCheck %s -check-prefix=THUMBONE -; RUN: llc < %s -mtriple=armv4-apple-ios | FileCheck %s -check-prefix=ARMV4 -; RUN: llc < %s -mtriple=armv6-apple-ios | FileCheck %s -check-prefix=ARMV6 -; RUN: llc < %s -mtriple=thumbv7m-apple-ios | FileCheck %s -check-prefix=THUMBM +; RUN: llc < %s -mtriple=armv7-apple-ios -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,ARM +; RUN: llc < %s -mtriple=armv7-apple-ios -O0 | FileCheck %s -check-prefixes=CHECK,ARM +; RUN: llc < %s -mtriple=thumbv7-apple-ios -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,THUMBTWO +; RUN: llc < %s -mtriple=thumbv6-apple-ios | FileCheck %s -check-prefixes=CHECK,THUMBONE +; RUN: llc < %s -mtriple=armv4-apple-ios | FileCheck %s -check-prefixes=CHECK,ARMV4 +; RUN: llc < %s -mtriple=armv6-apple-ios | FileCheck %s -check-prefixes=CHECK,ARMV6 +; RUN: llc < %s -mtriple=thumbv7m-apple-ios | FileCheck %s -check-prefixes=CHECK,THUMBM define void @test1(i32* %ptr, i32 %val1) { -; ARM-LABEL: test1 +; CHECK-LABEL: test1: ; ARM: dmb {{ish$}} ; ARM-NEXT: str ; ARM-NEXT: dmb {{ish$}} -; THUMBONE-LABEL: test1 -; THUMBONE: __sync_lock_test_and_set_4 -; THUMBTWO-LABEL: test1 +; THUMBONE: ___sync_synchronize +; THUMBONE-NEXT: str +; THUMBONE-NEXT: ___sync_synchronize ; THUMBTWO: dmb {{ish$}} ; THUMBTWO-NEXT: str ; THUMBTWO-NEXT: dmb {{ish$}} -; ARMV6-LABEL: test1 ; ARMV6: mcr p15, #0, {{r[0-9]*}}, c7, c10, #5 ; ARMV6: str ; ARMV6: mcr p15, #0, {{r[0-9]*}}, c7, c10, #5 -; THUMBM-LABEL: test1 ; THUMBM: dmb sy ; THUMBM: str ; THUMBM: dmb sy @@ -30,18 +28,16 @@ } define i32 @test2(i32* %ptr) { -; ARM-LABEL: test2 +; CHECK-LABEL: test2: + ; ARM: ldr ; ARM-NEXT: dmb {{ish$}} -; THUMBONE-LABEL: 
test2 -; THUMBONE: __sync_val_compare_and_swap_4 -; THUMBTWO-LABEL: test2 +; THUMBONE: ldr +; THUMBONE: __sync_synchronize ; THUMBTWO: ldr ; THUMBTWO-NEXT: dmb {{ish$}} -; ARMV6-LABEL: test2 ; ARMV6: ldr ; ARMV6: mcr p15, #0, {{r[0-9]*}}, c7, c10, #5 -; THUMBM-LABEL: test2 ; THUMBM: ldr ; THUMBM: dmb sy %val = load atomic i32, i32* %ptr seq_cst, align 4 @@ -49,7 +45,8 @@ } define void @test3(i8* %ptr1, i8* %ptr2) { -; ARM-LABEL: test3 +; CHECK-LABEL: test3: + ; ARM-NOT: dmb ; ARM: ldrb ; ARM-NOT: dmb @@ -57,7 +54,6 @@ ; ARM-NOT: dmb ; ARM: bx lr -; THUMBTWO-LABEL: test3 ; THUMBTWO-NOT: dmb ; THUMBTWO: ldrb ; THUMBTWO-NOT: dmb @@ -65,16 +61,16 @@ ; THUMBTWO-NOT: dmb ; THUMBTWO: bx lr -; THUMBONE-LABEL: test3 ; THUMBONE-NOT: dmb +; THUMBONE-NOT: __sync_synchronize ; THUMBONE: ldrb ; THUMBONE-NOT: dmb +; THUMBONE-NOT: __sync_synchronize ; THUMBONE: strb ; THUMBONE-NOT: dmb +; THUMBONE-NOT: __sync_synchronize -; ARMV6-LABEL: test3 ; ARMV6-NOT: mcr -; THUMBM-LABEL: test3 ; THUMBM-NOT: dmb sy %val = load atomic i8, i8* %ptr1 unordered, align 1 store atomic i8 %val, i8* %ptr2 unordered, align 1 @@ -82,26 +78,30 @@ } define void @test4(i8* %ptr1, i8* %ptr2) { -; THUMBONE-LABEL: test4 -; THUMBONE: ___sync_val_compare_and_swap_1 -; THUMBONE: ___sync_lock_test_and_set_1 -; ARMV6-LABEL: test4 -; THUMBM-LABEL: test4 +; CHECK-LABEL: test4 + +; THUMBONE: ldrb +; THUMBONE-NEXT: ___sync_synchronize +; THUMBONE-NEXT: ___sync_synchronize +; THUMBONE-NEXT: strb +; THUMBONE-NEXT: ___sync_synchronize %val = load atomic i8, i8* %ptr1 seq_cst, align 1 store atomic i8 %val, i8* %ptr2 seq_cst, align 1 ret void } define i64 @test_old_load_64bit(i64* %p) { -; ARMV4-LABEL: test_old_load_64bit -; ARMV4: ___sync_val_compare_and_swap_8 +; CHECK-LABEL: test_old_load_64bit + +; ARMV4: ___atomic_load_8 %1 = load atomic i64, i64* %p seq_cst, align 8 ret i64 %1 } define void @test_old_store_64bit(i64* %p, i64 %v) { -; ARMV4-LABEL: test_old_store_64bit -; ARMV4: ___sync_lock_test_and_set_8 +; CHECK-LABEL: test_old_store_64bit + +; ARMV4: ___atomic_store_8 store atomic i64 %v, i64* %p seq_cst, align 8 ret void } diff --git a/llvm/test/CodeGen/ARM/atomic-op.ll b/llvm/test/CodeGen/ARM/atomic-op.ll --- a/llvm/test/CodeGen/ARM/atomic-op.ll +++ b/llvm/test/CodeGen/ARM/atomic-op.ll @@ -365,8 +365,10 @@ ; CHECK-T1-M0: ldr {{r[0-9]}}, [r1] ; CHECK-T1-M0: dmb -; CHECK-T1: ___sync_val_compare_and_swap_4 -; CHECK-T1: ___sync_val_compare_and_swap_4 +; CHECK-T1: ldr {{r[0-9]}}, [{{r[0-9]}}] +; CHECK-T1: __sync_synchronize +; CHECK-T1: ldr {{r[0-9]}}, [{{r[0-9]}}] +; CHECK-T1: __sync_synchronize ; CHECK-BAREMETAL: ldr {{r[0-9]}}, [r0] ; CHECK-BAREMETAL-NOT: dmb @@ -387,8 +389,10 @@ ; CHECK: dmb ; CHECK: str r3, [r2] -; CHECK-T1: ___sync_lock_test_and_set -; CHECK-T1: ___sync_lock_test_and_set +; CHECK-T1: __sync_synchronize +; CHECK-T1: str {{r[0-9]}}, [{{r[0-9]}}] +; CHECK-T1: __sync_synchronize +; CHECK-T1: str {{r[0-9]}}, [{{r[0-9]}}] ; CHECK-T1-M0: dmb ; CHECK-T1-M0: str r1, [r0] diff --git a/llvm/test/CodeGen/ARM/atomic-ops-m33.ll b/llvm/test/CodeGen/ARM/atomic-ops-m33.ll --- a/llvm/test/CodeGen/ARM/atomic-ops-m33.ll +++ b/llvm/test/CodeGen/ARM/atomic-ops-m33.ll @@ -71,7 +71,7 @@ define void @test_atomic_load_add_i64(i64 %offset) nounwind { ; CHECK-LABEL: test_atomic_load_add_i64: -; CHECK: bl __sync_fetch_and_add_8 +; CHECK: bl __atomic_fetch_add_8 %old = atomicrmw add i64* @var64, i64 %offset monotonic store i64 %old, i64* @var64 ret void
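Note (illustration, not part of the patch): a minimal IR sketch of the two lowering paths produced by the hasLdrex()/hasLockFreeCmpXChgHelpers() split above, assuming a target without ldrex but with OS cmpxchg helpers (e.g. thumbv6-apple-ios, as exercised by the THUMBONE/CHECK-T1 tests). The function names are invented for the example; the expected output mirrors the updated FileCheck lines.

; Sketch, assuming a thumbv6-apple-ios style target (no ldrex, Darwin helpers).
define i32 @rmw_add(i32* %p, i32 %v) {
  ; RMW operations stay on the __sync_* path: this expands to a call to
  ; __sync_fetch_and_add_4.
  %r = atomicrmw add i32* %p, i32 %v seq_cst
  ret i32 %r
}

define i32 @load_seq_cst(i32* %p) {
  ; Atomic loads/stores are now emitted directly: a plain ldr followed by a
  ; __sync_synchronize fence, instead of a __sync_val_compare_and_swap_4 call.
  %r = load atomic i32, i32* %p seq_cst, align 4
  ret i32 %r
}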