Index: include/llvm/Target/TargetLowering.h =================================================================== --- include/llvm/Target/TargetLowering.h +++ include/llvm/Target/TargetLowering.h @@ -165,6 +165,13 @@ /// \brief Initialize all of the actions to default values. void initActions(); + /// Allow lowering into __sync_* libcalls. Without calling this, the + /// __sync calls do not have names defined, and attempting to use + /// them from your backend will result in an error. (These must be + /// enabled explicitly only in order to avoid them being generated + /// accidentally on targets that don't support them.) + void initSyncLibcalls(); + public: const TargetMachine &getTargetMachine() const { return TM; } Index: include/llvm/Target/TargetSubtargetInfo.h =================================================================== --- include/llvm/Target/TargetSubtargetInfo.h +++ include/llvm/Target/TargetSubtargetInfo.h @@ -144,9 +144,6 @@ /// which is the preferred way to influence this. virtual bool enablePostRAScheduler() const; - /// \brief True if the subtarget should run the atomic expansion pass. - virtual bool enableAtomicExpand() const; - /// \brief Override generic scheduling policy within a region. /// /// This is a convenient way for targets that don't provide any custom Index: lib/CodeGen/AtomicExpandPass.cpp =================================================================== --- lib/CodeGen/AtomicExpandPass.cpp +++ lib/CodeGen/AtomicExpandPass.cpp @@ -173,7 +173,7 @@ } // end anonymous namespace bool AtomicExpand::runOnFunction(Function &F) { - if (!TM || !TM->getSubtargetImpl(F)->enableAtomicExpand()) + if (!TM) return false; TLI = TM->getSubtargetImpl(F)->getTargetLowering(); Index: lib/CodeGen/SelectionDAG/LegalizeDAG.cpp =================================================================== --- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -2795,33 +2795,6 @@ Results.push_back(DAG.getConstant(0, dl, MVT::i32)); Results.push_back(Node->getOperand(0)); break; - case ISD::ATOMIC_LOAD: { - // There is no libcall for atomic load; fake it with ATOMIC_CMP_SWAP. - SDValue Zero = DAG.getConstant(0, dl, Node->getValueType(0)); - SDVTList VTs = DAG.getVTList(Node->getValueType(0), MVT::Other); - SDValue Swap = DAG.getAtomicCmpSwap( - ISD::ATOMIC_CMP_SWAP, dl, cast(Node)->getMemoryVT(), VTs, - Node->getOperand(0), Node->getOperand(1), Zero, Zero, - cast(Node)->getMemOperand(), - cast(Node)->getOrdering(), - cast(Node)->getOrdering(), - cast(Node)->getSynchScope()); - Results.push_back(Swap.getValue(0)); - Results.push_back(Swap.getValue(1)); - break; - } - case ISD::ATOMIC_STORE: { - // There is no libcall for atomic store; fake it with ATOMIC_SWAP. - SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, - cast(Node)->getMemoryVT(), - Node->getOperand(0), - Node->getOperand(1), Node->getOperand(2), - cast(Node)->getMemOperand(), - cast(Node)->getOrdering(), - cast(Node)->getSynchScope()); - Results.push_back(Swap.getValue(1)); - break; - } case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: { // Expanding an ATOMIC_CMP_SWAP_WITH_SUCCESS produces an ATOMIC_CMP_SWAP and // splits out the success value as a comparison. 
Expanding the resulting Index: lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp =================================================================== --- lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -1327,7 +1327,6 @@ case ISD::UDIV: ExpandIntRes_UDIV(N, Lo, Hi); break; case ISD::UREM: ExpandIntRes_UREM(N, Lo, Hi); break; case ISD::ZERO_EXTEND: ExpandIntRes_ZERO_EXTEND(N, Lo, Hi); break; - case ISD::ATOMIC_LOAD: ExpandIntRes_ATOMIC_LOAD(N, Lo, Hi); break; case ISD::ATOMIC_LOAD_ADD: case ISD::ATOMIC_LOAD_SUB: @@ -2700,24 +2699,6 @@ } } -void DAGTypeLegalizer::ExpandIntRes_ATOMIC_LOAD(SDNode *N, - SDValue &Lo, SDValue &Hi) { - SDLoc dl(N); - EVT VT = cast(N)->getMemoryVT(); - SDVTList VTs = DAG.getVTList(VT, MVT::i1, MVT::Other); - SDValue Zero = DAG.getConstant(0, dl, VT); - SDValue Swap = DAG.getAtomicCmpSwap( - ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl, - cast(N)->getMemoryVT(), VTs, N->getOperand(0), - N->getOperand(1), Zero, Zero, cast(N)->getMemOperand(), - cast(N)->getOrdering(), - cast(N)->getOrdering(), - cast(N)->getSynchScope()); - - ReplaceValueWith(SDValue(N, 0), Swap.getValue(0)); - ReplaceValueWith(SDValue(N, 1), Swap.getValue(2)); -} - //===----------------------------------------------------------------------===// // Integer Operand Expansion //===----------------------------------------------------------------------===// @@ -2762,8 +2743,6 @@ case ISD::ROTR: Res = ExpandIntOp_Shift(N); break; case ISD::RETURNADDR: case ISD::FRAMEADDR: Res = ExpandIntOp_RETURNADDR(N); break; - - case ISD::ATOMIC_STORE: Res = ExpandIntOp_ATOMIC_STORE(N); break; } // If the result is null, the sub-method took care of registering results etc. @@ -3198,19 +3177,6 @@ return TLI.makeLibCall(DAG, LC, DstVT, Op, true, dl).first; } -SDValue DAGTypeLegalizer::ExpandIntOp_ATOMIC_STORE(SDNode *N) { - SDLoc dl(N); - SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, - cast(N)->getMemoryVT(), - N->getOperand(0), - N->getOperand(1), N->getOperand(2), - cast(N)->getMemOperand(), - cast(N)->getOrdering(), - cast(N)->getSynchScope()); - return Swap.getValue(1); -} - - SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N) { SDValue InOp0 = N->getOperand(0); EVT InVT = InOp0.getValueType(); Index: lib/CodeGen/SelectionDAG/LegalizeTypes.h =================================================================== --- lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -386,7 +386,6 @@ SDValue ExpandIntOp_TRUNCATE(SDNode *N); SDValue ExpandIntOp_UINT_TO_FP(SDNode *N); SDValue ExpandIntOp_RETURNADDR(SDNode *N); - SDValue ExpandIntOp_ATOMIC_STORE(SDNode *N); void IntegerExpandSetCCOperands(SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &dl); Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -1435,6 +1435,10 @@ } SDValue SelectionDAG::getExternalSymbol(const char *Sym, EVT VT) { + if (!Sym) + report_fatal_error( + "Attempted to use null symbol in SelectionDAG::getExternalSymbol!"); + SDNode *&N = ExternalSymbols[Sym]; if (N) return SDValue(N, 0); N = newSDNode(false, Sym, 0, VT); @@ -1453,6 +1457,10 @@ SDValue SelectionDAG::getTargetExternalSymbol(const char *Sym, EVT VT, unsigned char TargetFlags) { + if (!Sym) + report_fatal_error("Attempted to use null symbol in " + "SelectionDAG::getTargetExternalSymbol!"); + SDNode *&N = 
TargetExternalSymbols[std::pair(Sym, TargetFlags)]; Index: lib/CodeGen/TargetLoweringBase.cpp =================================================================== --- lib/CodeGen/TargetLoweringBase.cpp +++ lib/CodeGen/TargetLoweringBase.cpp @@ -353,66 +353,6 @@ Names[RTLIB::MEMMOVE] = "memmove"; Names[RTLIB::MEMSET] = "memset"; Names[RTLIB::UNWIND_RESUME] = "_Unwind_Resume"; - Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_1] = "__sync_val_compare_and_swap_1"; - Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_2] = "__sync_val_compare_and_swap_2"; - Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_4] = "__sync_val_compare_and_swap_4"; - Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_8] = "__sync_val_compare_and_swap_8"; - Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_16] = "__sync_val_compare_and_swap_16"; - Names[RTLIB::SYNC_LOCK_TEST_AND_SET_1] = "__sync_lock_test_and_set_1"; - Names[RTLIB::SYNC_LOCK_TEST_AND_SET_2] = "__sync_lock_test_and_set_2"; - Names[RTLIB::SYNC_LOCK_TEST_AND_SET_4] = "__sync_lock_test_and_set_4"; - Names[RTLIB::SYNC_LOCK_TEST_AND_SET_8] = "__sync_lock_test_and_set_8"; - Names[RTLIB::SYNC_LOCK_TEST_AND_SET_16] = "__sync_lock_test_and_set_16"; - Names[RTLIB::SYNC_FETCH_AND_ADD_1] = "__sync_fetch_and_add_1"; - Names[RTLIB::SYNC_FETCH_AND_ADD_2] = "__sync_fetch_and_add_2"; - Names[RTLIB::SYNC_FETCH_AND_ADD_4] = "__sync_fetch_and_add_4"; - Names[RTLIB::SYNC_FETCH_AND_ADD_8] = "__sync_fetch_and_add_8"; - Names[RTLIB::SYNC_FETCH_AND_ADD_16] = "__sync_fetch_and_add_16"; - Names[RTLIB::SYNC_FETCH_AND_SUB_1] = "__sync_fetch_and_sub_1"; - Names[RTLIB::SYNC_FETCH_AND_SUB_2] = "__sync_fetch_and_sub_2"; - Names[RTLIB::SYNC_FETCH_AND_SUB_4] = "__sync_fetch_and_sub_4"; - Names[RTLIB::SYNC_FETCH_AND_SUB_8] = "__sync_fetch_and_sub_8"; - Names[RTLIB::SYNC_FETCH_AND_SUB_16] = "__sync_fetch_and_sub_16"; - Names[RTLIB::SYNC_FETCH_AND_AND_1] = "__sync_fetch_and_and_1"; - Names[RTLIB::SYNC_FETCH_AND_AND_2] = "__sync_fetch_and_and_2"; - Names[RTLIB::SYNC_FETCH_AND_AND_4] = "__sync_fetch_and_and_4"; - Names[RTLIB::SYNC_FETCH_AND_AND_8] = "__sync_fetch_and_and_8"; - Names[RTLIB::SYNC_FETCH_AND_AND_16] = "__sync_fetch_and_and_16"; - Names[RTLIB::SYNC_FETCH_AND_OR_1] = "__sync_fetch_and_or_1"; - Names[RTLIB::SYNC_FETCH_AND_OR_2] = "__sync_fetch_and_or_2"; - Names[RTLIB::SYNC_FETCH_AND_OR_4] = "__sync_fetch_and_or_4"; - Names[RTLIB::SYNC_FETCH_AND_OR_8] = "__sync_fetch_and_or_8"; - Names[RTLIB::SYNC_FETCH_AND_OR_16] = "__sync_fetch_and_or_16"; - Names[RTLIB::SYNC_FETCH_AND_XOR_1] = "__sync_fetch_and_xor_1"; - Names[RTLIB::SYNC_FETCH_AND_XOR_2] = "__sync_fetch_and_xor_2"; - Names[RTLIB::SYNC_FETCH_AND_XOR_4] = "__sync_fetch_and_xor_4"; - Names[RTLIB::SYNC_FETCH_AND_XOR_8] = "__sync_fetch_and_xor_8"; - Names[RTLIB::SYNC_FETCH_AND_XOR_16] = "__sync_fetch_and_xor_16"; - Names[RTLIB::SYNC_FETCH_AND_NAND_1] = "__sync_fetch_and_nand_1"; - Names[RTLIB::SYNC_FETCH_AND_NAND_2] = "__sync_fetch_and_nand_2"; - Names[RTLIB::SYNC_FETCH_AND_NAND_4] = "__sync_fetch_and_nand_4"; - Names[RTLIB::SYNC_FETCH_AND_NAND_8] = "__sync_fetch_and_nand_8"; - Names[RTLIB::SYNC_FETCH_AND_NAND_16] = "__sync_fetch_and_nand_16"; - Names[RTLIB::SYNC_FETCH_AND_MAX_1] = "__sync_fetch_and_max_1"; - Names[RTLIB::SYNC_FETCH_AND_MAX_2] = "__sync_fetch_and_max_2"; - Names[RTLIB::SYNC_FETCH_AND_MAX_4] = "__sync_fetch_and_max_4"; - Names[RTLIB::SYNC_FETCH_AND_MAX_8] = "__sync_fetch_and_max_8"; - Names[RTLIB::SYNC_FETCH_AND_MAX_16] = "__sync_fetch_and_max_16"; - Names[RTLIB::SYNC_FETCH_AND_UMAX_1] = "__sync_fetch_and_umax_1"; - Names[RTLIB::SYNC_FETCH_AND_UMAX_2] = 
"__sync_fetch_and_umax_2"; - Names[RTLIB::SYNC_FETCH_AND_UMAX_4] = "__sync_fetch_and_umax_4"; - Names[RTLIB::SYNC_FETCH_AND_UMAX_8] = "__sync_fetch_and_umax_8"; - Names[RTLIB::SYNC_FETCH_AND_UMAX_16] = "__sync_fetch_and_umax_16"; - Names[RTLIB::SYNC_FETCH_AND_MIN_1] = "__sync_fetch_and_min_1"; - Names[RTLIB::SYNC_FETCH_AND_MIN_2] = "__sync_fetch_and_min_2"; - Names[RTLIB::SYNC_FETCH_AND_MIN_4] = "__sync_fetch_and_min_4"; - Names[RTLIB::SYNC_FETCH_AND_MIN_8] = "__sync_fetch_and_min_8"; - Names[RTLIB::SYNC_FETCH_AND_MIN_16] = "__sync_fetch_and_min_16"; - Names[RTLIB::SYNC_FETCH_AND_UMIN_1] = "__sync_fetch_and_umin_1"; - Names[RTLIB::SYNC_FETCH_AND_UMIN_2] = "__sync_fetch_and_umin_2"; - Names[RTLIB::SYNC_FETCH_AND_UMIN_4] = "__sync_fetch_and_umin_4"; - Names[RTLIB::SYNC_FETCH_AND_UMIN_8] = "__sync_fetch_and_umin_8"; - Names[RTLIB::SYNC_FETCH_AND_UMIN_16] = "__sync_fetch_and_umin_16"; Names[RTLIB::ATOMIC_LOAD] = "__atomic_load"; Names[RTLIB::ATOMIC_LOAD_1] = "__atomic_load_1"; @@ -488,6 +428,85 @@ Names[RTLIB::DEOPTIMIZE] = "__llvm_deoptimize"; } +void TargetLoweringBase::initSyncLibcalls() { + LibcallRoutineNames[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_1] = + "__sync_val_compare_and_swap_1"; + LibcallRoutineNames[RTLIB::SYNC_LOCK_TEST_AND_SET_1] = + "__sync_lock_test_and_set_1"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_ADD_1] = "__sync_fetch_and_add_1"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_SUB_1] = "__sync_fetch_and_sub_1"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_AND_1] = "__sync_fetch_and_and_1"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_OR_1] = "__sync_fetch_and_or_1"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_XOR_1] = "__sync_fetch_and_xor_1"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_NAND_1] = "__sync_fetch_and_nand_1"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_MAX_1] = "__sync_fetch_and_max_1"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_UMAX_1] = "__sync_fetch_and_umax_1"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_MIN_1] = "__sync_fetch_and_min_1"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_UMIN_1] = "__sync_fetch_and_umin_1"; + + LibcallRoutineNames[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_2] = + "__sync_val_compare_and_swap_2"; + LibcallRoutineNames[RTLIB::SYNC_LOCK_TEST_AND_SET_2] = + "__sync_lock_test_and_set_2"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_ADD_2] = "__sync_fetch_and_add_2"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_SUB_2] = "__sync_fetch_and_sub_2"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_AND_2] = "__sync_fetch_and_and_2"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_OR_2] = "__sync_fetch_and_or_2"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_XOR_2] = "__sync_fetch_and_xor_2"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_NAND_2] = "__sync_fetch_and_nand_2"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_MAX_2] = "__sync_fetch_and_max_2"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_UMAX_2] = "__sync_fetch_and_umax_2"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_MIN_2] = "__sync_fetch_and_min_2"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_UMIN_2] = "__sync_fetch_and_umin_2"; + + LibcallRoutineNames[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_4] = + "__sync_val_compare_and_swap_4"; + LibcallRoutineNames[RTLIB::SYNC_LOCK_TEST_AND_SET_4] = + "__sync_lock_test_and_set_4"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_ADD_4] = "__sync_fetch_and_add_4"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_SUB_4] = "__sync_fetch_and_sub_4"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_AND_4] = "__sync_fetch_and_and_4"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_OR_4] = 
"__sync_fetch_and_or_4"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_XOR_4] = "__sync_fetch_and_xor_4"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_NAND_4] = "__sync_fetch_and_nand_4"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_MAX_4] = "__sync_fetch_and_max_4"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_UMAX_4] = "__sync_fetch_and_umax_4"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_MIN_4] = "__sync_fetch_and_min_4"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_UMIN_4] = "__sync_fetch_and_umin_4"; + + LibcallRoutineNames[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_8] = + "__sync_val_compare_and_swap_8"; + LibcallRoutineNames[RTLIB::SYNC_LOCK_TEST_AND_SET_8] = + "__sync_lock_test_and_set_8"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_ADD_8] = "__sync_fetch_and_add_8"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_SUB_8] = "__sync_fetch_and_sub_8"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_AND_8] = "__sync_fetch_and_and_8"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_OR_8] = "__sync_fetch_and_or_8"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_XOR_8] = "__sync_fetch_and_xor_8"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_NAND_8] = "__sync_fetch_and_nand_8"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_MAX_8] = "__sync_fetch_and_max_8"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_UMAX_8] = "__sync_fetch_and_umax_8"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_MIN_8] = "__sync_fetch_and_min_8"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_UMIN_8] = "__sync_fetch_and_umin_8"; + + LibcallRoutineNames[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_16] = + "__sync_val_compare_and_swap_16"; + LibcallRoutineNames[RTLIB::SYNC_LOCK_TEST_AND_SET_16] = + "__sync_lock_test_and_set_16"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_ADD_16] = "__sync_fetch_and_add_16"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_SUB_16] = "__sync_fetch_and_sub_16"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_AND_16] = "__sync_fetch_and_and_16"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_OR_16] = "__sync_fetch_and_or_16"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_XOR_16] = "__sync_fetch_and_xor_16"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_NAND_16] = + "__sync_fetch_and_nand_16"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_MAX_16] = "__sync_fetch_and_max_16"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_UMAX_16] = + "__sync_fetch_and_umax_16"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_MIN_16] = "__sync_fetch_and_min_16"; + LibcallRoutineNames[RTLIB::SYNC_FETCH_AND_UMIN_16] = + "__sync_fetch_and_umin_16"; +} /// InitLibcallCallingConvs - Set default libcall CallingConvs. /// static void InitLibcallCallingConvs(CallingConv::ID *CCs) { @@ -826,9 +845,7 @@ GatherAllAliasesMaxDepth = 6; MinStackArgumentAlignment = 1; MinimumJumpTableEntries = 4; - // TODO: the default will be switched to 0 in the next commit, along - // with the Target-specific changes necessary. 
- MaxAtomicSizeInBitsSupported = 1024; + MaxAtomicSizeInBitsSupported = 0; MinCmpXchgSizeInBits = 0; Index: lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- lib/Target/AArch64/AArch64ISelLowering.cpp +++ lib/Target/AArch64/AArch64ISelLowering.cpp @@ -639,6 +639,8 @@ } PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive(); + + setMaxAtomicSizeInBitsSupported(128); } void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) { @@ -10208,28 +10210,29 @@ return TargetLoweringBase::getPreferredVectorAction(VT); } -// Loads and stores less than 128-bits are already atomic; ones above that -// are doomed anyway, so defer to the default libcall and blame the OS when -// things go wrong. +// Loads and stores less than 128-bits are already atomic; 128-bit +// ones can only be done via ldaxp/stlxp sequences, so must be expanded. bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); + assert(Size <= 128 && + "Sizes above 128 should've been handled by AtomicExpandPass"); return Size == 128; } -// Loads and stores less than 128-bits are already atomic; ones above that -// are doomed anyway, so defer to the default libcall and blame the OS when -// things go wrong. +// Loads and stores less than 128-bits are already atomic; 128-bit +// ones can only be done via ldaxp/stlxp sequences, so must be expanded. TargetLowering::AtomicExpansionKind AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { unsigned Size = LI->getType()->getPrimitiveSizeInBits(); + assert(Size <= 128 && + "Sizes above 128 should've been handled by AtomicExpandPass"); return Size == 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None; } -// For the real atomic operations, we have ldxr/stxr up to 128 bits, +// Expand RMW operations to ldrex/strex instructions. TargetLowering::AtomicExpansionKind AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { - unsigned Size = AI->getType()->getPrimitiveSizeInBits(); - return Size <= 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None; + return AtomicExpansionKind::LLSC; } bool AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR( Index: lib/Target/ARM/ARMISelLowering.h =================================================================== --- lib/Target/ARM/ARMISelLowering.h +++ lib/Target/ARM/ARMISelLowering.h @@ -493,10 +493,6 @@ /// unsigned ARMPCLabelIndex; - // TODO: remove this, and have shouldInsertFencesForAtomic do the proper - // check. - bool InsertFencesForAtomic; - void addTypeForNEON(MVT VT, MVT PromotedLdStVT, MVT PromotedBitwiseVT); void addDRTypeForNEON(MVT VT); void addQRTypeForNEON(MVT VT); Index: lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- lib/Target/ARM/ARMISelLowering.cpp +++ lib/Target/ARM/ARMISelLowering.cpp @@ -842,48 +842,51 @@ else setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); - // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use - // the default expansion. - InsertFencesForAtomic = false; - if (Subtarget->hasAnyDataBarrier() && - (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) { - // ATOMIC_FENCE needs custom lowering; the others should have been expanded - // to ldrex/strex loops already. 
- setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom); - if (!Subtarget->isThumb() || !Subtarget->isMClass()) + // Processors that support ldrex have native lock-free atomics. + // + // And, OSes that have cmpxchg via kernel support can use atomics + // regardless (with expansion to __sync_* libcalls as needed). + // + if (Subtarget->hasLdrex() || Subtarget->isTargetDarwin() || + Subtarget->isTargetLinux()) { + // The Cortex-M only supports up to 32bit operations, while + // everything else supports 64-bit (via the ldrexd intrinsic + // expansion). + if (Subtarget->isMClass()) + setMaxAtomicSizeInBitsSupported(32); + else + setMaxAtomicSizeInBitsSupported(64); + + // When we're relying on OS cmpxchg support, set everything but + // ATOMIC_LOAD/ATOMIC_STORE for expansion, so we will emit + // __sync_* libcalls. (load and store themselves are atomic on all + // CPUs) + if (!Subtarget->hasLdrex()) { + initSyncLibcalls(); + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand); + } else { + // This is part of the hack for -O0 mode: in other modes cmpxchg is + // translated into ldrex/strex, so no ATOMIC_CMP_SWAP is seen. setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom); - - // On v8, we have particularly efficient implementations of atomic fences - // if they can be combined with nearby atomic loads and stores. - if (!Subtarget->hasV8Ops() || getTargetMachine().getOptLevel() == 0) { - // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc. - InsertFencesForAtomic = true; } - } else { - // If there's anything we can use as a barrier, go through custom lowering - // for ATOMIC_FENCE. - setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, - Subtarget->hasAnyDataBarrier() ? Custom : Expand); - - // Set them all for expansion, which will force libcalls. - setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand); - setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand); - setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand); - setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand); - setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Expand); - setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Expand); - setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Expand); - setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand); - setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand); - setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand); - setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand); - setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand); - // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the - // Unordered/Monotonic case. - setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom); - setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom); } + // If there's anything we can use as a barrier, go through custom lowering + // for ATOMIC_FENCE. 
+ setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, + Subtarget->hasAnyDataBarrier() ? Custom : Expand); + setOperationAction(ISD::PREFETCH, MVT::Other, Custom); // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes. @@ -6934,16 +6937,6 @@ Results.push_back(Upper); } -static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) { - if (isStrongerThanMonotonic(cast(Op)->getOrdering())) - // Acquire/Release load/store is not legal for targets without a dmb or - // equivalent available. - return SDValue(); - - // Monotonic load/store is legal for all targets. - return Op; -} - static void ReplaceREADCYCLECOUNTER(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG, @@ -7082,8 +7075,6 @@ case ISD::SSUBO: case ISD::USUBO: return LowerXALUO(Op, DAG); - case ISD::ATOMIC_LOAD: - case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); case ISD::SDIVREM: case ISD::UDIVREM: return LowerDivRem(Op, DAG); @@ -12117,8 +12108,6 @@ // First, if the target has no DMB, see what fallback we can use. if (!Subtarget->hasDataBarrier()) { // Some ARMv6 cpus can support data barriers with an mcr instruction. - // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get - // here. if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) { Function *MCR = llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_mcr); Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0), @@ -12126,9 +12115,10 @@ Builder.getInt32(10), Builder.getInt32(5)}; return Builder.CreateCall(MCR, args); } else { - // Instead of using barriers, atomic accesses on these subtargets use - // libcalls. - llvm_unreachable("makeDMB on a target so old that it has no barriers"); + // Instead of barriers, atomic accesses on Thumb1 and pre-v6 ARM + // mode just use a libcall to __sync_synchronize. So, just emit + // a fence instruction. + return Builder.CreateFence(AtomicOrdering::SequentiallyConsistent); } } else { Function *DMB = llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_dmb); @@ -12183,41 +12173,66 @@ llvm_unreachable("Unknown fence ordering in emitTrailingFence"); } -// Loads and stores less than 64-bits are already atomic; ones above that -// are doomed anyway, so defer to the default libcall and blame the OS when -// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit -// anything for those. +// In the following "should*Atomic*" routines, there's two cases to consider: +// 1) We have native atomics (hasLdrex() == true) +// +// 2) We don't actually have native atomics, but we have told AtomicExpandPass +// that we do, because we're on an OS that provides a "magic" lock-free +// compare-and-swap routine. In the latter case, we rely on __sync libcall +// expansions for all the operations. +// +// The other possibility is that we have neither native atomics, nor special OS +// routines allowing lock-free libcalls. However, then, expansion to __atomic_* +// calls will happen in AtomicExpandPass (due to MaxAtomicSizeInBitsSupported = +// 0), and the below routines will not be called. So, here, we're only concerned +// with the first two cases. +// +// If we are using libcalls, cmpxchg and rmw operations are desired. If we're +// using native instructions ll/sc expansions are needed. + +// Loads and stores less than 64-bits are intrinsically atomic. For 64-bit +// operations, we can replace with ldrexd/strexd. +// +// FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. 
A15 has that
+// guarantee, see DDI0406C ARM architecture reference manual, sections
+// A8.8.72-74 LDRD); on such CPUs it would be advantageous to not expand 64-bit
+// loads and stores to LL/SC sequences.
 bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
   unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
-  return (Size == 64) && !Subtarget->isMClass();
+  assert(Size <= 64 &&
+         "Sizes above 64 should've been handled by AtomicExpandPass");
+  return Size == 64;
 }

-// Loads and stores less than 64-bits are already atomic; ones above that
-// are doomed anyway, so defer to the default libcall and blame the OS when
-// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
-// anything for those.
-// FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
-// guarantee, see DDI0406C ARM architecture reference manual,
-// sections A8.8.72-74 LDRD)
 TargetLowering::AtomicExpansionKind
 ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
   unsigned Size = LI->getType()->getPrimitiveSizeInBits();
-  return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly
-                                                  : AtomicExpansionKind::None;
+  assert(Size <= 64 &&
+         "Sizes above 64 should've been handled by AtomicExpandPass");
+  if (Size != 64)
+    return AtomicExpansionKind::None;
+
+  if (!Subtarget->hasLdrex())
+    // Will expand to a cmpxchg libcall.
+    return AtomicExpansionKind::CmpXChg;
+
+  return AtomicExpansionKind::LLOnly;
 }

-// For the real atomic operations, we have ldrex/strex up to 32 bits,
-// and up to 64 bits on the non-M profiles
+// For the more complex atomic operations, we use LL/SC instead of
+// cmpxchg.
 TargetLowering::AtomicExpansionKind
 ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
-  unsigned Size = AI->getType()->getPrimitiveSizeInBits();
-  return (Size <= (Subtarget->isMClass() ? 32U : 64U))
-             ? AtomicExpansionKind::LLSC
-             : AtomicExpansionKind::None;
+  if (!Subtarget->hasLdrex())
+    return AtomicExpansionKind::None;
+  return AtomicExpansionKind::LLSC;
 }

 bool ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(
     AtomicCmpXchgInst *AI) const {
+  if (!Subtarget->hasLdrex())
+    return false;
+
   // At -O0, fast-regalloc cannot cope with the live vregs necessary to
   // implement cmpxchg without spilling. If the address being exchanged is also
   // on the stack and close enough to the spill slot, this can lead to a
@@ -12228,7 +12243,26 @@
 bool ARMTargetLowering::shouldInsertFencesForAtomic(
     const Instruction *I) const {
-  return InsertFencesForAtomic;
+  // On CPUs without ldrex, we emit __sync_* libcalls, which do not need
+  // external barriers because the routines already contain the appropriate
+  // barriers. However, loads and stores are still handled directly, and thus
+  // need barriers.
+  if (!Subtarget->hasLdrex()) {
+    return isa<LoadInst>(I) || isa<StoreInst>(I);
+  }
+
+  // In -O0 mode, there's a hack in place to expand ATOMIC_CMP_SWAP in a late
+  // pseudo expansion instead of in IR. This pseudo requires fences to be
+  // emitted externally.
+  if (getTargetMachine().getOptLevel() == 0 && isa<AtomicCmpXchgInst>(I))
+    return true;
+
+  // On v8, we have particularly efficient implementations of atomic fences
+  // if they can be combined with nearby atomic loads and stores.
+  if (Subtarget->hasV8Ops())
+    return false;
+
+  // Automatically insert fences (dmb ish) around all atomic operations.
+  return true;
 }

 // This has so far only been implemented for MachO.
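The ARM changes above illustrate the general recipe this patch expects from every target: declare the widest lock-free size with setMaxAtomicSizeInBitsSupported(), and call initSyncLibcalls() (plus Expand actions on the atomic DAG nodes) only when the platform genuinely provides lock-free __sync_* routines. The snippet below is a minimal illustrative sketch, not part of the patch; MyTargetLowering, MyTargetSubtarget, hasNativeLLSC() and osProvidesLockFreeCmpXchg() are invented placeholder names for a hypothetical out-of-tree backend.

// Illustrative only: how a hypothetical backend opts in under the new scheme,
// where MaxAtomicSizeInBitsSupported defaults to 0 and the __sync_* names are
// no longer registered by default.
MyTargetLowering::MyTargetLowering(const TargetMachine &TM,
                                   const MyTargetSubtarget &STI)
    : TargetLowering(TM) {
  if (STI.hasNativeLLSC()) {
    // Native ll/sc (or cmpxchg) instructions: AtomicExpandPass may expand
    // atomics in IR up to this width; anything wider becomes __atomic_*.
    setMaxAtomicSizeInBitsSupported(64);
  } else if (STI.osProvidesLockFreeCmpXchg()) {
    // No native atomics, but the OS guarantees lock-free __sync_* helpers:
    // register their names and expand the DAG nodes into those libcalls.
    setMaxAtomicSizeInBitsSupported(32);
    initSyncLibcalls();
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand);
  } else {
    // Nothing lock-free at all: leave the maximum at 0 so AtomicExpandPass
    // turns every atomic operation into an __atomic_* libcall.
    setMaxAtomicSizeInBitsSupported(0);
  }
}

Because the default is now 0, a target that sets nothing at all gets the conservative behaviour: every atomic operation is routed to the __atomic_* library.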
Index: lib/Target/ARM/ARMSubtarget.h =================================================================== --- lib/Target/ARM/ARMSubtarget.h +++ lib/Target/ARM/ARMSubtarget.h @@ -492,8 +492,10 @@ /// True for some subtargets at > -O0. bool enablePostRAScheduler() const override; - // enableAtomicExpand- True if we need to expand our atomics. - bool enableAtomicExpand() const override; + // True for targets that support atomic ldrex/strex. + bool hasLdrex() const { + return HasV6Ops && (!InThumbMode || HasV8MBaselineOps); + } /// getInstrItins - Return the instruction itineraries based on subtarget /// selection. Index: lib/Target/ARM/ARMSubtarget.cpp =================================================================== --- lib/Target/ARM/ARMSubtarget.cpp +++ lib/Target/ARM/ARMSubtarget.cpp @@ -317,10 +317,6 @@ return (!isThumb() || hasThumb2()); } -bool ARMSubtarget::enableAtomicExpand() const { - return hasAnyDataBarrier() && (!isThumb() || hasV8MBaselineOps()); -} - bool ARMSubtarget::useStride4VFPs(const MachineFunction &MF) const { // For general targets, the prologue can grow when VFPs are allocated with // stride 4 (more vpush instructions). But WatchOS uses a compact unwind Index: lib/Target/BPF/BPFISelLowering.cpp =================================================================== --- lib/Target/BPF/BPFISelLowering.cpp +++ lib/Target/BPF/BPFISelLowering.cpp @@ -63,6 +63,8 @@ setStackPointerRegisterToSaveRestore(BPF::R11); + setMaxAtomicSizeInBitsSupported(64); + setOperationAction(ISD::BR_CC, MVT::i64, Custom); setOperationAction(ISD::BR_JT, MVT::Other, Expand); setOperationAction(ISD::BRIND, MVT::Other, Expand); Index: lib/Target/Hexagon/HexagonISelLowering.h =================================================================== --- lib/Target/Hexagon/HexagonISelLowering.h +++ lib/Target/Hexagon/HexagonISelLowering.h @@ -254,12 +254,13 @@ AtomicOrdering Ord) const override; Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override; - AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override; - bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override { return AtomicExpansionKind::LLSC; } + bool shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override { + return true; + } protected: std::pair Index: lib/Target/Hexagon/HexagonISelLowering.cpp =================================================================== --- lib/Target/Hexagon/HexagonISelLowering.cpp +++ lib/Target/Hexagon/HexagonISelLowering.cpp @@ -1713,6 +1713,7 @@ setPrefLoopAlignment(4); setPrefFunctionAlignment(4); setMinFunctionAlignment(2); + setMaxAtomicSizeInBitsSupported(64); setStackPointerRegisterToSaveRestore(HRI.getStackRegister()); if (EnableHexSDNodeSched) @@ -3108,16 +3109,3 @@ Value *Ext = Builder.CreateZExt(Cmp, Type::getInt32Ty(M->getContext())); return Ext; } - -TargetLowering::AtomicExpansionKind -HexagonTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { - // Do not expand loads and stores that don't exceed 64 bits. - return LI->getType()->getPrimitiveSizeInBits() > 64 - ? AtomicExpansionKind::LLOnly - : AtomicExpansionKind::None; -} - -bool HexagonTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { - // Do not expand loads and stores that don't exceed 64 bits. 
- return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() > 64; -} Index: lib/Target/Mips/Mips16ISelLowering.cpp =================================================================== --- lib/Target/Mips/Mips16ISelLowering.cpp +++ lib/Target/Mips/Mips16ISelLowering.cpp @@ -128,6 +128,10 @@ if (!Subtarget.useSoftFloat()) setMips16HardFloatLibCalls(); + // Call __sync_* library calls for most atomic instructions; the + // MIPS16 ISA has no ll/sc or fence instructions, but it can call mips32 + // functions to do the work. + initSyncLibcalls(); setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Expand); setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand); Index: lib/Target/Mips/MipsISelLowering.cpp =================================================================== --- lib/Target/Mips/MipsISelLowering.cpp +++ lib/Target/Mips/MipsISelLowering.cpp @@ -387,11 +387,10 @@ setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); - if (!Subtarget.isGP64bit()) { - setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Expand); - setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand); - } - + if (Subtarget.isGP64bit()) + setMaxAtomicSizeInBitsSupported(64); + else + setMaxAtomicSizeInBitsSupported(32); if (!Subtarget.hasMips32r2()) { setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); Index: lib/Target/PowerPC/PPCISelLowering.cpp =================================================================== --- lib/Target/PowerPC/PPCISelLowering.cpp +++ lib/Target/PowerPC/PPCISelLowering.cpp @@ -828,11 +828,6 @@ setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, isPPC64 ? Legal : Custom); - if (!isPPC64) { - setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Expand); - setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand); - } - setBooleanContents(ZeroOrOneBooleanContent); if (Subtarget.hasAltivec()) { @@ -923,6 +918,7 @@ break; } + setMaxAtomicSizeInBitsSupported(isPPC64 ? 64 : 32); if (Subtarget.enableMachineScheduler()) setSchedulingPreference(Sched::Source); Index: lib/Target/Sparc/SparcISelLowering.cpp =================================================================== --- lib/Target/Sparc/SparcISelLowering.cpp +++ lib/Target/Sparc/SparcISelLowering.cpp @@ -1644,8 +1644,6 @@ // Test made to fail pending completion of AtomicExpandPass, // as this will cause a regression until that work is completed. setMaxAtomicSizeInBitsSupported(32); - else - setMaxAtomicSizeInBitsSupported(0); setMinCmpXchgSizeInBits(32); @@ -1653,15 +1651,9 @@ setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Legal); - // Custom Lower Atomic LOAD/STORE - setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom); - setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom); - if (Subtarget->is64Bit()) { setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Legal); setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Legal); - setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Custom); } if (!Subtarget->isV9()) { @@ -2996,15 +2988,6 @@ return DAG.getMergeValues(Ops, dl); } -static SDValue LowerATOMIC_LOAD_STORE(SDValue Op, SelectionDAG &DAG) { - if (isStrongerThanMonotonic(cast(Op)->getOrdering())) - // Expand with a fence. - return SDValue(); - - // Monotonic load/stores are legal. 
- return Op; -} - SDValue SparcTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); @@ -3076,8 +3059,6 @@ case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); case ISD::UMULO: case ISD::SMULO: return LowerUMULO_SMULO(Op, DAG, *this); - case ISD::ATOMIC_LOAD: - case ISD::ATOMIC_STORE: return LowerATOMIC_LOAD_STORE(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); } } Index: lib/Target/SystemZ/SystemZISelLowering.cpp =================================================================== --- lib/Target/SystemZ/SystemZISelLowering.cpp +++ lib/Target/SystemZ/SystemZISelLowering.cpp @@ -128,6 +128,8 @@ // Instructions are strings of 2-byte aligned 2-byte values. setMinFunctionAlignment(2); + setMaxAtomicSizeInBitsSupported(64); + // Handle operations that are handled in a similar way for all types. for (unsigned I = MVT::FIRST_INTEGER_VALUETYPE; I <= MVT::LAST_FP_VALUETYPE; Index: lib/Target/TargetSubtargetInfo.cpp =================================================================== --- lib/Target/TargetSubtargetInfo.cpp +++ lib/Target/TargetSubtargetInfo.cpp @@ -28,10 +28,6 @@ TargetSubtargetInfo::~TargetSubtargetInfo() {} -bool TargetSubtargetInfo::enableAtomicExpand() const { - return true; -} - bool TargetSubtargetInfo::enableMachineScheduler() const { return false; } Index: lib/Target/WebAssembly/WebAssemblyISelLowering.cpp =================================================================== --- lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -49,6 +49,11 @@ // Tell ISel that we have a stack pointer. setStackPointerRegisterToSaveRestore( Subtarget->hasAddr64() ? WebAssembly::SP64 : WebAssembly::SP32); + // Maximum atomics size + if (Subtarget->hasAddr64()) + setMaxAtomicSizeInBitsSupported(64); + else + setMaxAtomicSizeInBitsSupported(32); // Set up the register classes. addRegisterClass(MVT::i32, &WebAssembly::I32RegClass); addRegisterClass(MVT::i64, &WebAssembly::I64RegClass); Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -84,6 +84,17 @@ // X86-SSE is even stranger. It uses -1 or 0 for vector masks. setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); + if (Subtarget.is64Bit()) { + if (Subtarget.hasCmpxchg16b()) + setMaxAtomicSizeInBitsSupported(128); + else + setMaxAtomicSizeInBitsSupported(64); + } else { + // FIXME: Check that we actually have cmpxchg (i486 or later) + // FIXME: Check that we actually have cmpxchg8b (i586 or later) + setMaxAtomicSizeInBitsSupported(64); + } + // For 64-bit, since we have so many registers, use the ILP scheduler. // For 32-bit, use the register pressure specific scheduling. // For Atom, always use ILP scheduling. @@ -20417,32 +20428,27 @@ return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); } -/// Returns true if the operand type is exactly twice the native width, and -/// the corresponding cmpxchg8b or cmpxchg16b instruction is available. -/// Used to know whether to use cmpxchg8/16b when expanding atomic operations -/// (otherwise we leave them alone to become __sync_fetch_and_... calls). 
-bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const { - unsigned OpWidth = MemType->getPrimitiveSizeInBits(); - - if (OpWidth == 64) - return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b - else if (OpWidth == 128) - return Subtarget.hasCmpxchg16b(); - else - return false; -} +// Atomic operations larger than the normal register size can only be +// done with cmpxchg8b/16b, so expand loads/stores to cmpxchg if +// required. +// (Note: we don't need to worry about those instructions not being +// available, because larger-than-supported IR instructions will +// already have been transformed into __atomic_* libcalls if needed) bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { - return needsCmpXchgNb(SI->getValueOperand()->getType()); + unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; + return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() > + NativeWidth; } // Note: this turns large loads into lock cmpxchg8b/16b. // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b. TargetLowering::AtomicExpansionKind X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { - auto PTy = cast(LI->getPointerOperand()->getType()); - return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg - : AtomicExpansionKind::None; + unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; + return (LI->getType()->getPrimitiveSizeInBits() > NativeWidth) + ? AtomicExpansionKind::CmpXChg + : AtomicExpansionKind::None; } TargetLowering::AtomicExpansionKind @@ -20450,12 +20456,9 @@ unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; Type *MemType = AI->getType(); - // If the operand is too big, we must see if cmpxchg8/16b is available - // and default to library calls otherwise. - if (MemType->getPrimitiveSizeInBits() > NativeWidth) { - return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg - : AtomicExpansionKind::None; - } + // If the operand is too big, we need to use cmpxchg8b/16b. + if (MemType->getPrimitiveSizeInBits() > NativeWidth) + return AtomicExpansionKind::CmpXChg; AtomicRMWInst::BinOp Op = AI->getOperation(); switch (Op) { @@ -20616,7 +20619,7 @@ DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut); DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success); DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1)); - return SDValue(); + return Op; } static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget, @@ -21091,7 +21094,7 @@ // RAUW the chain, but don't worry about the result, as it's unused. 
assert(!N->hasAnyUseOfValue(0)); DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1)); - return SDValue(); + return LockOp; } static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) { Index: lib/Target/XCore/XCoreISelLowering.h =================================================================== --- lib/Target/XCore/XCoreISelLowering.h +++ lib/Target/XCore/XCoreISelLowering.h @@ -185,8 +185,6 @@ SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const; // Inline asm support std::pair @@ -225,9 +223,6 @@ bool isVarArg, const SmallVectorImpl &ArgsFlags, LLVMContext &Context) const override; - bool shouldInsertFencesForAtomic(const Instruction *I) const override { - return true; - } }; } Index: lib/Target/XCore/XCoreISelLowering.cpp =================================================================== --- lib/Target/XCore/XCoreISelLowering.cpp +++ lib/Target/XCore/XCoreISelLowering.cpp @@ -151,12 +151,7 @@ setOperationAction(ISD::EH_RETURN, MVT::Other, Custom); setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); - // Atomic operations - // We request a fence for ATOMIC_* instructions, to reduce them to Monotonic. - // As we are always Sequential Consistent, an ATOMIC_FENCE becomes a no OP. setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom); - setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom); - setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom); // TRAMPOLINE is custom lowered. setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom); @@ -222,8 +217,6 @@ case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG); - case ISD::ATOMIC_LOAD: return LowerATOMIC_LOAD(Op, DAG); - case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG); default: llvm_unreachable("unimplemented operand"); } @@ -963,68 +956,6 @@ return DAG.getNode(XCoreISD::MEMBARRIER, DL, MVT::Other, Op.getOperand(0)); } -SDValue XCoreTargetLowering:: -LowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const { - AtomicSDNode *N = cast(Op); - assert(N->getOpcode() == ISD::ATOMIC_LOAD && "Bad Atomic OP"); - assert((N->getOrdering() == AtomicOrdering::Unordered || - N->getOrdering() == AtomicOrdering::Monotonic) && - "setInsertFencesForAtomic(true) expects unordered / monotonic"); - if (N->getMemoryVT() == MVT::i32) { - if (N->getAlignment() < 4) - report_fatal_error("atomic load must be aligned"); - return DAG.getLoad(getPointerTy(DAG.getDataLayout()), SDLoc(Op), - N->getChain(), N->getBasePtr(), N->getPointerInfo(), - N->isVolatile(), N->isNonTemporal(), N->isInvariant(), - N->getAlignment(), N->getAAInfo(), N->getRanges()); - } - if (N->getMemoryVT() == MVT::i16) { - if (N->getAlignment() < 2) - report_fatal_error("atomic load must be aligned"); - return DAG.getExtLoad(ISD::EXTLOAD, SDLoc(Op), MVT::i32, N->getChain(), - N->getBasePtr(), N->getPointerInfo(), MVT::i16, - N->isVolatile(), N->isNonTemporal(), - N->isInvariant(), N->getAlignment(), N->getAAInfo()); - } - if (N->getMemoryVT() == MVT::i8) - return DAG.getExtLoad(ISD::EXTLOAD, SDLoc(Op), MVT::i32, N->getChain(), - N->getBasePtr(), N->getPointerInfo(), MVT::i8, - N->isVolatile(), N->isNonTemporal(), - N->isInvariant(), 
N->getAlignment(), N->getAAInfo()); - return SDValue(); -} - -SDValue XCoreTargetLowering:: -LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const { - AtomicSDNode *N = cast(Op); - assert(N->getOpcode() == ISD::ATOMIC_STORE && "Bad Atomic OP"); - assert((N->getOrdering() == AtomicOrdering::Unordered || - N->getOrdering() == AtomicOrdering::Monotonic) && - "setInsertFencesForAtomic(true) expects unordered / monotonic"); - if (N->getMemoryVT() == MVT::i32) { - if (N->getAlignment() < 4) - report_fatal_error("atomic store must be aligned"); - return DAG.getStore(N->getChain(), SDLoc(Op), N->getVal(), - N->getBasePtr(), N->getPointerInfo(), - N->isVolatile(), N->isNonTemporal(), - N->getAlignment(), N->getAAInfo()); - } - if (N->getMemoryVT() == MVT::i16) { - if (N->getAlignment() < 2) - report_fatal_error("atomic store must be aligned"); - return DAG.getTruncStore(N->getChain(), SDLoc(Op), N->getVal(), - N->getBasePtr(), N->getPointerInfo(), MVT::i16, - N->isVolatile(), N->isNonTemporal(), - N->getAlignment(), N->getAAInfo()); - } - if (N->getMemoryVT() == MVT::i8) - return DAG.getTruncStore(N->getChain(), SDLoc(Op), N->getVal(), - N->getBasePtr(), N->getPointerInfo(), MVT::i8, - N->isVolatile(), N->isNonTemporal(), - N->getAlignment(), N->getAAInfo()); - return SDValue(); -} - //===----------------------------------------------------------------------===// // Calling Convention Implementation //===----------------------------------------------------------------------===// Index: test/CodeGen/ARM/atomic-cmpxchg.ll =================================================================== --- test/CodeGen/ARM/atomic-cmpxchg.ll +++ test/CodeGen/ARM/atomic-cmpxchg.ll @@ -1,27 +1,21 @@ -; RUN: llc < %s -mtriple=arm-linux-gnueabi -asm-verbose=false -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-ARM -; RUN: llc < %s -mtriple=thumb-linux-gnueabi -asm-verbose=false -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-THUMB +; RUN: llc < %s -mtriple=arm-linux-gnueabi -asm-verbose=false -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-ARM -check-prefix=CHECK +; RUN: llc < %s -mtriple=thumb-linux-gnueabi -asm-verbose=false -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-THUMB -check-prefix=CHECK -; RUN: llc < %s -mtriple=armv6-linux-gnueabi -asm-verbose=false -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-ARMV6 -; RUN: llc < %s -mtriple=thumbv6-linux-gnueabi -asm-verbose=false -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-THUMBV6 +; RUN: llc < %s -mtriple=armv6-linux-gnueabi -asm-verbose=false -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-ARMV6 -check-prefix=CHECK +; RUN: llc < %s -mtriple=thumbv6-linux-gnueabi -asm-verbose=false -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-THUMBV6 -check-prefix=CHECK -; RUN: llc < %s -mtriple=armv7-linux-gnueabi -asm-verbose=false -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-ARMV7 -; RUN: llc < %s -mtriple=thumbv7-linux-gnueabi -asm-verbose=false -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-THUMBV7 +; RUN: llc < %s -mtriple=armv7-linux-gnueabi -asm-verbose=false -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-ARMV7 -check-prefix=CHECK +; RUN: llc < %s -mtriple=thumbv7-linux-gnueabi -asm-verbose=false -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-THUMBV7 -check-prefix=CHECK define zeroext i1 @test_cmpxchg_res_i8(i8* %addr, i8 %desired, i8 zeroext %new) { -entry: - %0 = cmpxchg i8* %addr, i8 %desired, i8 %new monotonic monotonic - %1 = extractvalue { i8, i1 } 
%0, 1
-  ret i1 %1
-}
+; CHECK-LABEL: test_cmpxchg_res_i8:

-; CHECK-ARM-LABEL: test_cmpxchg_res_i8
 ; CHECK-ARM: bl __sync_val_compare_and_swap_1
 ; CHECK-ARM: mov [[REG:r[0-9]+]], #0
 ; CHECK-ARM: cmp r0, {{r[0-9]+}}
 ; CHECK-ARM: moveq [[REG]], #1
 ; CHECK-ARM: mov r0, [[REG]]

-; CHECK-THUMB-LABEL: test_cmpxchg_res_i8
 ; CHECK-THUMB: bl __sync_val_compare_and_swap_1
 ; CHECK-THUMB-NOT: mov [[R1:r[0-7]]], r0
 ; CHECK-THUMB: push {r0}
@@ -33,7 +27,6 @@
 ; CHECK-THUMB: push {[[R2]]}
 ; CHECK-THUMB: pop {r0}

-; CHECK-ARMV6-LABEL: test_cmpxchg_res_i8:
 ; CHECK-ARMV6-NEXT: .fnstart
 ; CHECK-ARMV6-NEXT: uxtb [[DESIRED:r[0-9]+]], r1
 ; CHECK-ARMV6-NEXT: [[TRY:.LBB[0-9_]+]]:
@@ -49,7 +42,6 @@
 ; CHECK-ARMV6-NEXT: mov r0, [[RES]]
 ; CHECK-ARMV6-NEXT: bx lr

-; CHECK-THUMBV6-LABEL: test_cmpxchg_res_i8:
 ; CHECK-THUMBV6: mov [[EXPECTED:r[0-9]+]], r1
 ; CHECK-THUMBV6-NEXT: bl __sync_val_compare_and_swap_1
 ; CHECK-THUMBV6-NEXT: mov [[RES:r[0-9]+]], r0
@@ -61,7 +53,6 @@
 ; CHECK-THUMBV6-NEXT: [[END]]:
 ; CHECK-THUMBV6-NEXT: pop {{.*}}pc}

-; CHECK-ARMV7-LABEL: test_cmpxchg_res_i8:
 ; CHECK-ARMV7-NEXT: .fnstart
 ; CHECK-ARMV7-NEXT: uxtb [[DESIRED:r[0-9]+]], r1
 ; CHECK-ARMV7-NEXT: [[TRY:.LBB[0-9_]+]]:
@@ -80,7 +71,6 @@
 ; CHECK-ARMV7-NEXT: mov r0, [[RES]]
 ; CHECK-ARMV7-NEXT: bx lr

-; CHECK-THUMBV7-LABEL: test_cmpxchg_res_i8:
 ; CHECK-THUMBV7-NEXT: .fnstart
 ; CHECK-THUMBV7-NEXT: uxtb [[DESIRED:r[0-9]+]], r1
 ; CHECK-THUMBV7-NEXT: b [[TRYLD:.LBB[0-9_]+]]
@@ -97,3 +87,31 @@
 ; CHECK-THUMBV7-NEXT: clrex
 ; CHECK-THUMBV7-NEXT: movs r0, #0
 ; CHECK-THUMBV7-NEXT: bx lr
+
+entry:
+  %0 = cmpxchg i8* %addr, i8 %desired, i8 %new monotonic monotonic
+  %1 = extractvalue { i8, i1 } %0, 1
+  ret i1 %1
+}
+
+
+
+;; Also ensure that i64s are inlined or turned into a libcall, as appropriate.
+define zeroext i1 @test_cmpxchg_res_i64(i64* %addr, i64 %desired, i64 zeroext %new) {
+; CHECK-LABEL: test_cmpxchg_res_i64:
+
+; CHECK-ARM: __sync_val_compare_and_swap_8
+; CHECK-THUMB: __sync_val_compare_and_swap_8
+; CHECK-ARMV6: ldrexd
+; CHECK-ARMV6: strexd
+; CHECK-THUMBV6: __sync_val_compare_and_swap_8
+; CHECK-ARMV7: ldrexd
+; CHECK-ARMV7: strexd
+; CHECK-THUMBV7: ldrexd
+; CHECK-THUMBV7: strexd
+
+entry:
+  %0 = cmpxchg i64* %addr, i64 %desired, i64 %new monotonic monotonic
+  %1 = extractvalue { i64, i1 } %0, 1
+  ret i1 %1
+}
Index: test/CodeGen/ARM/atomic-load-store.ll
===================================================================
--- test/CodeGen/ARM/atomic-load-store.ll
+++ test/CodeGen/ARM/atomic-load-store.ll
@@ -12,7 +12,9 @@
 ; ARM-NEXT: str
 ; ARM-NEXT: dmb {{ish$}}
 ; THUMBONE-LABEL: test1
-; THUMBONE: __sync_lock_test_and_set_4
+; THUMBONE: ___sync_synchronize
+; THUMBONE-NEXT: str
+; THUMBONE-NEXT: ___sync_synchronize
 ; THUMBTWO-LABEL: test1
 ; THUMBTWO: dmb {{ish$}}
 ; THUMBTWO-NEXT: str
@@ -34,7 +36,8 @@
 ; ARM: ldr
 ; ARM-NEXT: dmb {{ish$}}
 ; THUMBONE-LABEL: test2
-; THUMBONE: __sync_val_compare_and_swap_4
+; THUMBONE: ldr
+; THUMBONE: __sync_synchronize
 ; THUMBTWO-LABEL: test2
 ; THUMBTWO: ldr
 ; THUMBTWO-NEXT: dmb {{ish$}}
@@ -83,8 +86,11 @@
 define void @test4(i8* %ptr1, i8* %ptr2) {
 ; THUMBONE-LABEL: test4
-; THUMBONE: ___sync_val_compare_and_swap_1
-; THUMBONE: ___sync_lock_test_and_set_1
+; THUMBONE: ldrb
+; THUMBONE-NEXT: ___sync_synchronize
+; THUMBONE-NEXT: ___sync_synchronize
+; THUMBONE-NEXT: strb
+; THUMBONE-NEXT: ___sync_synchronize
 ; ARMV6-LABEL: test4
 ; THUMBM-LABEL: test4
 %val = load atomic i8, i8* %ptr1 seq_cst, align 1
Index: test/CodeGen/ARM/atomic-op.ll
===================================================================
---
test/CodeGen/ARM/atomic-op.ll +++ test/CodeGen/ARM/atomic-op.ll @@ -7,6 +7,7 @@ target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" define void @func(i32 %argc, i8** %argv) nounwind { +; CHECK-LABEL: func: entry: %argc.addr = alloca i32 ; [#uses=1] %argv.addr = alloca i8** ; [#uses=1] @@ -153,6 +154,7 @@ } define void @func2() nounwind { +; CHECK-LABEL: func2: entry: %val = alloca i16 %old = alloca i16 @@ -194,6 +196,7 @@ } define void @func3() nounwind { +; CHECK-LABEL: func3: entry: %val = alloca i8 %old = alloca i8 @@ -234,7 +237,7 @@ ret void } -; CHECK: func4 +; CHECK-LABEL: func4: ; This function should not need to use callee-saved registers. ; rdar://problem/12203728 ; CHECK-NOT: r4 @@ -246,7 +249,6 @@ define i32 @test_cmpxchg_fail_order(i32 *%addr, i32 %desired, i32 %new) { ; CHECK-LABEL: test_cmpxchg_fail_order: - %pair = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst monotonic %oldval = extractvalue { i32, i1 } %pair, 0 ; CHECK-ARMV7: ldrex [[OLDVAL:r[0-9]+]], [r[[ADDR:[0-9]+]]] @@ -286,7 +288,6 @@ define i32 @test_cmpxchg_fail_order1(i32 *%addr, i32 %desired, i32 %new) { ; CHECK-LABEL: test_cmpxchg_fail_order1: - %pair = cmpxchg i32* %addr, i32 %desired, i32 %new acquire acquire %oldval = extractvalue { i32, i1 } %pair, 0 ; CHECK-NOT: dmb ish @@ -308,7 +309,7 @@ } define i32 @load_load_add_acquire(i32* %mem1, i32* %mem2) nounwind { -; CHECK-LABEL: load_load_add_acquire +; CHECK-LABEL: load_load_add_acquire: %val1 = load atomic i32, i32* %mem1 acquire, align 4 %val2 = load atomic i32, i32* %mem2 acquire, align 4 %tmp = add i32 %val1, %val2 @@ -332,7 +333,7 @@ } define void @store_store_release(i32* %mem1, i32 %val1, i32* %mem2, i32 %val2) { -; CHECK-LABEL: store_store_release +; CHECK-LABEL: store_store_release: store atomic i32 %val1, i32* %mem1 release, align 4 store atomic i32 %val2, i32* %mem2 release, align 4 @@ -341,19 +342,21 @@ ; CHECK: dmb ; CHECK: str r3, [r2] -; CHECK-T1: ___sync_lock_test_and_set -; CHECK-T1: ___sync_lock_test_and_set +; CHECK-M0: dmb +; CHECK-M0: str r1, [r0] +; CHECK-M0: dmb +; CHECK-M0: str r3, [r2] ; CHECK-BAREMETAL-NOT: dmb -; CHECK-BAREMTEAL: str r1, [r0] +; CHECK-BAREMETAL: str r1, [r0] ; CHECK-BAREMETAL-NOT: dmb -; CHECK-BAREMTEAL: str r3, [r2] +; CHECK-BAREMETAL: str r3, [r2] ret void } define void @load_fence_store_monotonic(i32* %mem1, i32* %mem2) { -; CHECK-LABEL: load_fence_store_monotonic +; CHECK-LABEL: load_fence_store_monotonic: %val = load atomic i32, i32* %mem1 monotonic, align 4 fence seq_cst store atomic i32 %val, i32* %mem2 monotonic, align 4 Index: test/CodeGen/PowerPC/atomics-indexed.ll =================================================================== --- test/CodeGen/PowerPC/atomics-indexed.ll +++ test/CodeGen/PowerPC/atomics-indexed.ll @@ -34,8 +34,8 @@ } define i64 @load_x_i64_unordered([100000 x i64]* %mem) { ; CHECK-LABEL: load_x_i64_unordered -; PPC32: __sync_ -; PPC64-NOT: __sync_ +; PPC32: __atomic_ +; PPC64-NOT: __atomic_ ; PPC64: ldx ; CHECK-NOT: sync %ptr = getelementptr inbounds [100000 x i64], [100000 x i64]* %mem, i64 0, i64 90000 @@ -71,8 +71,8 @@ define void @store_x_i64_unordered([100000 x i64]* %mem) { ; CHECK-LABEL: store_x_i64_unordered ; CHECK-NOT: sync -; PPC32: __sync_ -; PPC64-NOT: __sync_ +; PPC32: __atomic_ +; PPC64-NOT: __atomic_ ; PPC64: stdx %ptr = getelementptr inbounds [100000 x i64], [100000 x i64]* %mem, i64 0, i64 90000 store atomic i64 42, i64* %ptr unordered, align 8 Index: test/CodeGen/PowerPC/atomics.ll 
=================================================================== --- test/CodeGen/PowerPC/atomics.ll +++ test/CodeGen/PowerPC/atomics.ll @@ -32,12 +32,12 @@ } define i64 @load_i64_seq_cst(i64* %mem) { ; CHECK-LABEL: load_i64_seq_cst -; CHECK: sync -; PPC32: __sync_ -; PPC64-NOT: __sync_ +; PPC32: __atomic_ +; PPC64-NOT: __atomic_ +; PPC64: sync ; PPC64: ld %val = load atomic i64, i64* %mem seq_cst, align 8 -; CHECK: lwsync +; PPC64: lwsync ret i64 %val } @@ -65,9 +65,9 @@ } define void @store_i64_seq_cst(i64* %mem) { ; CHECK-LABEL: store_i64_seq_cst -; CHECK: sync -; PPC32: __sync_ -; PPC64-NOT: __sync_ +; PPC32: __atomic_ +; PPC64-NOT: __atomic_ +; PPC64: sync ; PPC64: std store atomic i64 42, i64* %mem seq_cst, align 8 ret void @@ -100,7 +100,8 @@ } define i64 @cas_weak_i64_release_monotonic(i64* %mem) { ; CHECK-LABEL: cas_weak_i64_release_monotonic -; CHECK: lwsync +; PPC32: __atomic_ +; PPC64: lwsync %val = cmpxchg weak i64* %mem, i64 0, i64 1 release monotonic ; CHECK-NOT: [sync ] %loaded = extractvalue { i64, i1} %val, 0 @@ -130,7 +131,8 @@ } define i64 @and_i64_release(i64* %mem, i64 %operand) { ; CHECK-LABEL: and_i64_release -; CHECK: lwsync +; PPC32: __atomic_ +; PPC64: lwsync %val = atomicrmw and i64* %mem, i64 %operand release ; CHECK-NOT: [sync ] ret i64 %val Index: test/CodeGen/X86/atomic-non-integer.ll =================================================================== --- test/CodeGen/X86/atomic-non-integer.ll +++ test/CodeGen/X86/atomic-non-integer.ll @@ -34,7 +34,7 @@ define void @store_fp128(fp128* %fptr, fp128 %v) { ; CHECK-LABEL: @store_fp128 -; CHECK: callq __sync_lock_test_and_set_16 +; CHECK: callq __atomic_store_16 store atomic fp128 %v, fp128* %fptr unordered, align 16 ret void } @@ -66,7 +66,7 @@ define fp128 @load_fp128(fp128* %fptr) { ; CHECK-LABEL: @load_fp128 -; CHECK: callq __sync_val_compare_and_swap_16 +; CHECK: callq __atomic_load_16 %v = load atomic fp128, fp128* %fptr unordered, align 16 ret fp128 %v } Index: test/CodeGen/X86/nocx16.ll =================================================================== --- test/CodeGen/X86/nocx16.ll +++ test/CodeGen/X86/nocx16.ll @@ -1,21 +1,21 @@ ; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mattr=-cx16 | FileCheck %s define void @test(i128* %a) nounwind { entry: -; CHECK: __sync_val_compare_and_swap_16 +; CHECK: __atomic_compare_exchange_16 %0 = cmpxchg i128* %a, i128 1, i128 1 seq_cst seq_cst -; CHECK: __sync_lock_test_and_set_16 +; CHECK: __atomic_exchange_16 %1 = atomicrmw xchg i128* %a, i128 1 seq_cst -; CHECK: __sync_fetch_and_add_16 +; CHECK: __atomic_fetch_add_16 %2 = atomicrmw add i128* %a, i128 1 seq_cst -; CHECK: __sync_fetch_and_sub_16 +; CHECK: __atomic_fetch_sub_16 %3 = atomicrmw sub i128* %a, i128 1 seq_cst -; CHECK: __sync_fetch_and_and_16 +; CHECK: __atomic_fetch_and_16 %4 = atomicrmw and i128* %a, i128 1 seq_cst -; CHECK: __sync_fetch_and_nand_16 +; CHECK: __atomic_fetch_nand_16 %5 = atomicrmw nand i128* %a, i128 1 seq_cst -; CHECK: __sync_fetch_and_or_16 +; CHECK: __atomic_fetch_or_16 %6 = atomicrmw or i128* %a, i128 1 seq_cst -; CHECK: __sync_fetch_and_xor_16 +; CHECK: __atomic_fetch_xor_16 %7 = atomicrmw xor i128* %a, i128 1 seq_cst ret void } Index: test/CodeGen/XCore/atomic.ll =================================================================== --- test/CodeGen/XCore/atomic.ll +++ test/CodeGen/XCore/atomic.ll @@ -21,71 +21,11 @@ entry: ; CHECK-LABEL: atomicloadstore -; CHECK: ldw r[[R0:[0-9]+]], dp[pool] -; CHECK-NEXT: ldaw r[[R1:[0-9]+]], dp[pool] -; CHECK-NEXT: #MEMBARRIER -; CHECK-NEXT: ldc 
r[[R2:[0-9]+]], 0
+; CHECK: bl __atomic_load_4
   %0 = load atomic i32, i32* bitcast (i64* @pool to i32*) acquire, align 4
-; CHECK-NEXT: ld16s r3, r[[R1]][r[[R2]]]
-; CHECK-NEXT: #MEMBARRIER
-  %1 = load atomic i16, i16* bitcast (i64* @pool to i16*) acquire, align 2
-
-; CHECK-NEXT: ld8u r11, r[[R1]][r[[R2]]]
-; CHECK-NEXT: #MEMBARRIER
-  %2 = load atomic i8, i8* bitcast (i64* @pool to i8*) acquire, align 1
-
-; CHECK-NEXT: ldw r4, dp[pool]
-; CHECK-NEXT: #MEMBARRIER
-  %3 = load atomic i32, i32* bitcast (i64* @pool to i32*) seq_cst, align 4
-
-; CHECK-NEXT: ld16s r5, r[[R1]][r[[R2]]]
-; CHECK-NEXT: #MEMBARRIER
-  %4 = load atomic i16, i16* bitcast (i64* @pool to i16*) seq_cst, align 2
-
-; CHECK-NEXT: ld8u r6, r[[R1]][r[[R2]]]
-; CHECK-NEXT: #MEMBARRIER
-  %5 = load atomic i8, i8* bitcast (i64* @pool to i8*) seq_cst, align 1
-
-; CHECK-NEXT: #MEMBARRIER
-; CHECK-NEXT: stw r[[R0]], dp[pool]
-  store atomic i32 %0, i32* bitcast (i64* @pool to i32*) release, align 4
-
-; CHECK-NEXT: #MEMBARRIER
-; CHECK-NEXT: st16 r3, r[[R1]][r[[R2]]]
-  store atomic i16 %1, i16* bitcast (i64* @pool to i16*) release, align 2
-
-; CHECK-NEXT: #MEMBARRIER
-; CHECK-NEXT: st8 r11, r[[R1]][r[[R2]]]
-  store atomic i8 %2, i8* bitcast (i64* @pool to i8*) release, align 1
-
-; CHECK-NEXT: #MEMBARRIER
-; CHECK-NEXT: stw r4, dp[pool]
-; CHECK-NEXT: #MEMBARRIER
-  store atomic i32 %3, i32* bitcast (i64* @pool to i32*) seq_cst, align 4
-
-; CHECK-NEXT: #MEMBARRIER
-; CHECK-NEXT: st16 r5, r[[R1]][r[[R2]]]
-; CHECK-NEXT: #MEMBARRIER
-  store atomic i16 %4, i16* bitcast (i64* @pool to i16*) seq_cst, align 2
-
-; CHECK-NEXT: #MEMBARRIER
-; CHECK-NEXT: st8 r6, r[[R1]][r[[R2]]]
-; CHECK-NEXT: #MEMBARRIER
-  store atomic i8 %5, i8* bitcast (i64* @pool to i8*) seq_cst, align 1
-
-; CHECK-NEXT: ldw r[[R0]], dp[pool]
-; CHECK-NEXT: stw r[[R0]], dp[pool]
-; CHECK-NEXT: ld16s r[[R0]], r[[R1]][r[[R2]]]
-; CHECK-NEXT: st16 r[[R0]], r[[R1]][r[[R2]]]
-; CHECK-NEXT: ld8u r[[R0]], r[[R1]][r[[R2]]]
-; CHECK-NEXT: st8 r[[R0]], r[[R1]][r[[R2]]]
-  %6 = load atomic i32, i32* bitcast (i64* @pool to i32*) monotonic, align 4
-  store atomic i32 %6, i32* bitcast (i64* @pool to i32*) monotonic, align 4
-  %7 = load atomic i16, i16* bitcast (i64* @pool to i16*) monotonic, align 2
-  store atomic i16 %7, i16* bitcast (i64* @pool to i16*) monotonic, align 2
-  %8 = load atomic i8, i8* bitcast (i64* @pool to i8*) monotonic, align 1
-  store atomic i8 %8, i8* bitcast (i64* @pool to i8*) monotonic, align 1
+; CHECK: bl __atomic_store_2
+  store atomic i16 5, i16* bitcast (i64* @pool to i16*) release, align 2
   ret void
 }
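On the lowering side, the test updates above all follow from the same mechanism: once a target has opted in with initSyncLibcalls(), the libcall machinery can look the __sync_* routine names up as usual, and the new null-symbol check in SelectionDAG::getExternalSymbol() turns a forgotten opt-in into a clear fatal error rather than a crash on a null name. A rough sketch follows; lowerAtomicAddToSyncLibcall is an invented helper name, and threading of the atomic node's memory chain is elided for brevity, so this is illustrative rather than production lowering code.

#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Target/TargetLowering.h"
using namespace llvm;

// Hypothetical helper, for illustration only: lower an i32 ATOMIC_LOAD_ADD
// node to the __sync_fetch_and_add_4 libcall.
static SDValue lowerAtomicAddToSyncLibcall(SDNode *N, SelectionDAG &DAG,
                                           const TargetLowering &TLI) {
  SDLoc dl(N);
  // ATOMIC_LOAD_ADD operands are (chain, pointer, value); pass ptr and value.
  SDValue Ops[] = {N->getOperand(1), N->getOperand(2)};
  // makeLibCall resolves the callee via getLibcallName(), which returns null
  // unless the target called initSyncLibcalls(); with this patch a null name
  // is reported by getExternalSymbol() as a fatal error.
  return TLI.makeLibCall(DAG, RTLIB::SYNC_FETCH_AND_ADD_4, MVT::i32, Ops,
                         /*isSigned=*/false, dl)
      .first;
}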