diff --git a/llvm/include/llvm/CodeGen/AtomicLoopBundler.h b/llvm/include/llvm/CodeGen/AtomicLoopBundler.h
new file mode 100644
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/AtomicLoopBundler.h
@@ -0,0 +1,110 @@
+//===--- AtomicLoopBundler.h ------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass bundles the basic block created by AtomicExpand so that the
+/// Fast Register Allocator cannot insert spills between the exclusive load and
+/// store; such a spill clears the exclusive monitor and causes an infinite
+/// loop.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_ATOMICLOOPBUNDLER_H
+#define LLVM_CODEGEN_ATOMICLOOPBUNDLER_H
+
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+
+#define DEBUG_TYPE "atomic-loop-bundler"
+
+namespace llvm {
+
+/// Bundle the instructions between exclusive loads and stores that were
+/// inserted by the Atomic Expand pass.
+/// \param Derived must provide the following two predicate functions, which
+/// indicate when a machine instruction is a relevant load or store:
+///   static bool isExclusiveLoad(const MachineInstr &MI);
+///   static bool isExclusiveStore(const MachineInstr &MI);
+/// A bundle will be inserted in appropriate blocks between the first
+/// identified exclusive load and the next occurring exclusive store.
+template <typename Derived>
+class AtomicLoopBundler : public MachineFunctionPass {
+private:
+  bool bundleBlock(MachineBasicBlock &MBB) {
+    // One of the basic blocks inserted by AtomicExpandPass looks like this:
+    //   atomicrmw.start:
+    //     %loaded = @load.linked(%addr)
+    //     %new = some_op iN %loaded, %incr
+    //     %stored = @store_conditional(%new, %addr)
+    //     %try_again = icmp ne i32 %stored, 0
+    //     br i1 %try_again, label %loop, label %atomicrmw.end
+    if (!MBB.getName().contains("atomicrmw.start"))
+      return false;
+
+    // Search for the exclusive load.
+    MachineBasicBlock::instr_iterator LdIter = std::find_if(
+        MBB.instr_begin(), MBB.instr_end(), Derived::isExclusiveLoad);
+
+    // cmpxchg is expanded into a pseudo instruction CMP_SWAP_*. It can also be
+    // inserted by atomic loop expansion for floating point types. If we have a
+    // cmpxchg we won't see an exclusive load here, and don't need to do
+    // anything.
+    // FIXME: We could handle cmpxchg with bundles as well (remove the pseudos).
+    if (LdIter == MBB.instr_end())
+      return false;
+
+    // Check we haven't already bundled.
+    if (LdIter->isBundled())
+      return false;
+
+    // Search for the exclusive store.
+    MachineBasicBlock::instr_iterator StrIter =
+        std::find_if(LdIter, MBB.instr_end(), Derived::isExclusiveStore);
+
+    assert(StrIter != MBB.instr_end() &&
+           "Failed to find exclusive store in atomicrmw.start block");
+    if (StrIter == MBB.instr_end())
+      return false;
+
+    // Create a finalized bundle ready for register allocation.
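+    // finalizeBundle inserts a BUNDLE instruction in front of LdIter and adds
+    // the bundled instructions' register defs/uses to it as implicit operands,
+    // so passes that only visit top-level instructions (such as the fast
+    // register allocator) treat the whole exclusive sequence as a single unit.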
+    finalizeBundle(MBB, LdIter, std::next(StrIter));
+
+    // Print some info
+    LLVM_DEBUG(dbgs() << "Created bundle with "
+                      << std::distance(LdIter, StrIter)
+                      << " instructions between\n"
+                         "  "
+                      << *LdIter << "\n"
+                      << "  and " << *StrIter << ".\n";);
+    return true;
+  }
+
+public:
+  static char ID;
+
+  // The derived class passes its own pass ID so each target registers a
+  // distinct pass.
+  AtomicLoopBundler(char &ID) : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    // If the ISel pipeline failed, do not bother running this pass.
+    if (MF.getProperties().hasProperty(
+            MachineFunctionProperties::Property::FailedISel))
+      return false;
+
+    LLVM_DEBUG(dbgs() << "Bundle Atomic Loops for: " << MF.getName() << '\n');
+
+    bool Changed = false;
+    for (MachineFunction::iterator I = MF.begin(); I != MF.end(); ++I) {
+      Changed |= bundleBlock(*I);
+    }
+    return Changed;
+  }
+};
+
+} // End namespace llvm.
+
+#undef DEBUG_TYPE
+
+#endif
diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp
--- a/llvm/lib/CodeGen/RegAllocFast.cpp
+++ b/llvm/lib/CodeGen/RegAllocFast.cpp
@@ -312,18 +312,18 @@
   return FrameIdx;
 }
 
-static bool dominates(MachineBasicBlock &MBB,
-                      MachineBasicBlock::const_iterator A,
-                      MachineBasicBlock::const_iterator B) {
-  auto MBBEnd = MBB.end();
+static bool dominates(const MachineBasicBlock &MBB,
+                      MachineBasicBlock::const_instr_iterator A,
+                      MachineBasicBlock::const_instr_iterator B) {
+  MachineBasicBlock::const_instr_iterator MBBEnd = MBB.instr_end();
   if (B == MBBEnd)
     return true;
 
-  MachineBasicBlock::const_iterator I = MBB.begin();
-  for (; &*I != A && &*I != B; ++I)
+  MachineBasicBlock::const_instr_iterator I = MBB.instr_begin();
+  for (; I != A && I != B; ++I)
     ;
 
-  return &*I == A;
+  return I == A;
 }
 
 /// Returns false if \p VirtReg is known to not live out of the current block.
@@ -1090,6 +1090,13 @@
   UsedInInstr.clear();
   BundleVirtRegsMap.clear();
 
+  // If a bundle contains a virtual register def followed by a use of another,
+  // they must not be allocated the same physical register. The alternative
+  // case, i.e. when the def is not followed by any use within the bundle, is
+  // probably uncommon enough to ignore for now. Hence we treat any def in a
+  // bundle like an early-clobber.
+  const bool IsBundle = MI.getOpcode() == TargetOpcode::BUNDLE;
+
   // Scan for special cases; Apply pre-assigned register defs to state.
   bool HasPhysRegUse = false;
   bool HasRegMask = false;
@@ -1110,6 +1117,8 @@
       }
       if (MO.isTied() || (MO.getSubReg() != 0 && !MO.isUndef()))
         NeedToAssignLiveThroughs = true;
+      if (IsBundle)
+        NeedToAssignLiveThroughs = true;
     }
   } else if (Reg.isPhysical()) {
     if (!MRI->isReserved(Reg)) {
@@ -1208,7 +1217,7 @@
       LLVM_DEBUG(dbgs() << "Allocating " << MO << '\n');
       unsigned Reg = MO.getReg();
       if (MO.isEarlyClobber() || MO.isTied() ||
-          (MO.getSubReg() && !MO.isUndef())) {
+          (MO.getSubReg() && !MO.isUndef()) || IsBundle) {
         defineLiveThroughVirtReg(MI, OpIdx, Reg);
       } else {
         defineVirtReg(MI, OpIdx, Reg);
@@ -1243,7 +1252,7 @@
     }
 
     // Do not free tied operands and early clobbers.
-    if (MO.isTied() || MO.isEarlyClobber())
+    if (MO.isTied() || MO.isEarlyClobber() || IsBundle)
       continue;
     Register Reg = MO.getReg();
     if (!Reg)
@@ -1333,10 +1342,10 @@
   }
 
   // Free early clobbers.
-  if (HasEarlyClobber) {
+  if (HasEarlyClobber || IsBundle) {
     for (unsigned I = MI.getNumOperands(); I-- > 0; ) {
       MachineOperand &MO = MI.getOperand(I);
-      if (!MO.isReg() || !MO.isDef() || !MO.isEarlyClobber())
+      if (!MO.isReg() || !MO.isDef() || !(MO.isEarlyClobber() || IsBundle))
         continue;
       // subreg defs don't free the full register. We left the subreg number
       // around as a marker in setPhysReg() to recognize this case here.
diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h
--- a/llvm/lib/Target/AArch64/AArch64.h
+++ b/llvm/lib/Target/AArch64/AArch64.h
@@ -69,6 +69,7 @@
 void initializeAArch64A53Fix835769Pass(PassRegistry&);
 void initializeAArch64A57FPLoadBalancingPass(PassRegistry&);
 void initializeAArch64AdvSIMDScalarPass(PassRegistry&);
+void initializeAArch64AtomicLoopBundlerPass(PassRegistry &);
 void initializeAArch64BranchTargetsPass(PassRegistry&);
 void initializeAArch64CollectLOHPass(PassRegistry&);
 void initializeAArch64CondBrTuningPass(PassRegistry &);
diff --git a/llvm/lib/Target/AArch64/AArch64AtomicLoopBundler.h b/llvm/lib/Target/AArch64/AArch64AtomicLoopBundler.h
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64AtomicLoopBundler.h
@@ -0,0 +1,38 @@
+//===--- AArch64AtomicLoopBundler.h -----------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Implements AtomicLoopBundler for AArch64.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64ATOMICLOOPBUNDLER_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64ATOMICLOOPBUNDLER_H
+
+#include "llvm/CodeGen/AtomicLoopBundler.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+namespace llvm {
+
+class AArch64AtomicLoopBundler
+    : public AtomicLoopBundler<AArch64AtomicLoopBundler> {
+
+public:
+  static char ID;
+
+  static bool isExclusiveLoad(const MachineInstr &MI);
+  static bool isExclusiveStore(const MachineInstr &MI);
+
+  StringRef getPassName() const override {
+    return "AArch64 Atomic Loop Bundler";
+  }
+
+  AArch64AtomicLoopBundler()
+      : AtomicLoopBundler(ID) {}
+};
+} // namespace llvm
+
+#endif
diff --git a/llvm/lib/Target/AArch64/AArch64AtomicLoopBundler.cpp b/llvm/lib/Target/AArch64/AArch64AtomicLoopBundler.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64AtomicLoopBundler.cpp
@@ -0,0 +1,66 @@
+//===--- AArch64AtomicLoopBundler.cpp ---------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Implements AtomicLoopBundler for AArch64.
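+/// The isExclusiveLoad/isExclusiveStore predicates below enumerate the AArch64
+/// load-exclusive and store-exclusive opcodes that the pass recognises.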
+//===----------------------------------------------------------------------===//
+
+#include "AArch64AtomicLoopBundler.h"
+#include "AArch64.h"
+
+using namespace llvm;
+
+char AArch64AtomicLoopBundler::ID = 0;
+
+INITIALIZE_PASS_BEGIN(
+    AArch64AtomicLoopBundler, "aarch64-atomic-loop-bundler",
+    "Bundle exclusive loads and stores created by atomic loop expansion", false,
+    false)
+INITIALIZE_PASS_END(
+    AArch64AtomicLoopBundler, "aarch64-atomic-loop-bundler",
+    "Bundle exclusive loads and stores created by atomic loop expansion", false,
+    false)
+
+bool AArch64AtomicLoopBundler::isExclusiveLoad(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case AArch64::LDAXRW:
+  case AArch64::LDAXRX:
+  case AArch64::LDAXRB:
+  case AArch64::LDAXRH:
+  case AArch64::LDXRW:
+  case AArch64::LDXRX:
+  case AArch64::LDXRB:
+  case AArch64::LDXRH:
+  case AArch64::LDAXPW:
+  case AArch64::LDAXPX:
+  case AArch64::LDXPW:
+  case AArch64::LDXPX:
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool AArch64AtomicLoopBundler::isExclusiveStore(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case AArch64::STLXRW:
+  case AArch64::STLXRX:
+  case AArch64::STLXRB:
+  case AArch64::STLXRH:
+  case AArch64::STXRW:
+  case AArch64::STXRX:
+  case AArch64::STXRB:
+  case AArch64::STXRH:
+  case AArch64::STLXPW:
+  case AArch64::STLXPX:
+  case AArch64::STXPW:
+  case AArch64::STXPX:
+    return true;
+  default:
+    return false;
+  }
+}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -11,6 +11,7 @@
 #include "AArch64TargetMachine.h"
 #include "AArch64.h"
+#include "AArch64AtomicLoopBundler.h"
 #include "AArch64MachineFunctionInfo.h"
 #include "AArch64MacroFusion.h"
 #include "AArch64Subtarget.h"
@@ -175,6 +176,7 @@
   initializeAArch64A53Fix835769Pass(*PR);
   initializeAArch64A57FPLoadBalancingPass(*PR);
   initializeAArch64AdvSIMDScalarPass(*PR);
+  initializeAArch64AtomicLoopBundlerPass(*PR);
   initializeAArch64BranchTargetsPass(*PR);
   initializeAArch64CollectLOHPass(*PR);
   initializeAArch64CompressJumpTablesPass(*PR);
@@ -426,6 +428,7 @@
   bool addRegBankSelect() override;
   void addPreGlobalInstructionSelect() override;
   bool addGlobalInstructionSelect() override;
+  void addFastRegAlloc() override;
   bool addILPOpts() override;
   void addPreRegAlloc() override;
   void addPostRegAlloc() override;
@@ -594,6 +597,22 @@
   return false;
 }
 
+void AArch64PassConfig::addFastRegAlloc() {
+  // Bundles must be finalized (register defs/uses added to the BUNDLE MI)
+  // before register allocation, because the register allocator looks only at
+  // top-level MachineInstrs, not at the contents of the bundle. However, this
+  // can't be done in SSA form, as it creates multiple definitions of virtual
+  // registers, which would fail machine verification. It must also be done
+  // after two-address instruction expansion, which removes REG_SEQUENCE.
+  insertPass(&TwoAddressInstructionPassID, &AArch64AtomicLoopBundler::ID);
+
+  TargetPassConfig::addFastRegAlloc();
+
+  // Remove the bundles created by AtomicLoopBundler; otherwise instructions
+  // inside the bundle will not be lowered correctly.
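+  // Passing a null predicate to createUnpackMachineBundles makes the
+  // unpacking pass run on every function.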
+  addPass(createUnpackMachineBundles(nullptr));
+}
+
 bool AArch64PassConfig::addILPOpts() {
   if (EnableCondOpt)
     addPass(createAArch64ConditionOptimizerPass());
diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt
--- a/llvm/lib/Target/AArch64/CMakeLists.txt
+++ b/llvm/lib/Target/AArch64/CMakeLists.txt
@@ -40,6 +40,7 @@
   AArch64A57FPLoadBalancing.cpp
   AArch64AdvSIMDScalarPass.cpp
   AArch64AsmPrinter.cpp
+  AArch64AtomicLoopBundler.cpp
   AArch64BranchTargets.cpp
   AArch64CallingConvention.cpp
   AArch64CleanupLocalDynamicTLSPass.cpp
diff --git a/llvm/lib/Target/ARM/ARM.h b/llvm/lib/Target/ARM/ARM.h
--- a/llvm/lib/Target/ARM/ARM.h
+++ b/llvm/lib/Target/ARM/ARM.h
@@ -62,6 +62,7 @@
 void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
                                   ARMAsmPrinter &AP);
 
+void initializeARMAtomicLoopBundlerPass(PassRegistry &);
 void initializeARMParallelDSPPass(PassRegistry &);
 void initializeARMLoadStoreOptPass(PassRegistry &);
 void initializeARMPreAllocLoadStoreOptPass(PassRegistry &);
diff --git a/llvm/lib/Target/ARM/ARMAtomicLoopBundler.h b/llvm/lib/Target/ARM/ARMAtomicLoopBundler.h
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/ARM/ARMAtomicLoopBundler.h
@@ -0,0 +1,34 @@
+//===--- ARMAtomicLoopBundler.h ---------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Implements AtomicLoopBundler for ARM.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMATOMICLOOPBUNDLER_H
+#define LLVM_LIB_TARGET_ARM_ARMATOMICLOOPBUNDLER_H
+
+#include "llvm/CodeGen/AtomicLoopBundler.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+namespace llvm {
+
+class ARMAtomicLoopBundler : public AtomicLoopBundler<ARMAtomicLoopBundler> {
+
+public:
+  static char ID;
+
+  static bool isExclusiveLoad(const MachineInstr &MI);
+  static bool isExclusiveStore(const MachineInstr &MI);
+
+  StringRef getPassName() const override { return "ARM Atomic Loop Bundler"; }
+
+  ARMAtomicLoopBundler() : AtomicLoopBundler(ID) {}
+};
+} // namespace llvm
+
+#endif
diff --git a/llvm/lib/Target/ARM/ARMAtomicLoopBundler.cpp b/llvm/lib/Target/ARM/ARMAtomicLoopBundler.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/ARM/ARMAtomicLoopBundler.cpp
@@ -0,0 +1,60 @@
+//===--- ARMAtomicLoopBundler.cpp -------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Implements AtomicLoopBundler for ARM.
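+/// The predicate functions below cover both the ARM and Thumb2 (t2) encodings
+/// of the exclusive load/store instructions.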
+//===----------------------------------------------------------------------===//
+
+#include "ARMAtomicLoopBundler.h"
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+
+using namespace llvm;
+
+char ARMAtomicLoopBundler::ID = 0;
+
+INITIALIZE_PASS_BEGIN(
+    ARMAtomicLoopBundler, "arm-atomic-loop-bundler",
+    "Bundle exclusive loads and stores created by atomic loop expansion", false,
+    false)
+INITIALIZE_PASS_END(
+    ARMAtomicLoopBundler, "arm-atomic-loop-bundler",
+    "Bundle exclusive loads and stores created by atomic loop expansion", false,
+    false)
+
+// TODO: Add the load-acquire/store-release exclusive instructions as well?
+bool ARMAtomicLoopBundler::isExclusiveLoad(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case ARM::t2LDREX:
+  case ARM::t2LDREXB:
+  case ARM::t2LDREXD:
+  case ARM::t2LDREXH:
+  case ARM::LDREX:
+  case ARM::LDREXB:
+  case ARM::LDREXD:
+  case ARM::LDREXH:
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool ARMAtomicLoopBundler::isExclusiveStore(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case ARM::t2STREX:
+  case ARM::t2STREXB:
+  case ARM::t2STREXD:
+  case ARM::t2STREXH:
+  case ARM::STREX:
+  case ARM::STREXB:
+  case ARM::STREXD:
+  case ARM::STREXH:
+    return true;
+  default:
+    return false;
+  }
+}
diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
--- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -11,6 +11,7 @@
 #include "ARMTargetMachine.h"
 #include "ARM.h"
+#include "ARMAtomicLoopBundler.h"
 #include "ARMMacroFusion.h"
 #include "ARMSubtarget.h"
 #include "ARMTargetObjectFile.h"
@@ -88,6 +89,7 @@
   PassRegistry &Registry = *PassRegistry::getPassRegistry();
   initializeGlobalISel(Registry);
+  initializeARMAtomicLoopBundlerPass(Registry);
   initializeARMLoadStoreOptPass(Registry);
   initializeARMPreAllocLoadStoreOptPass(Registry);
   initializeARMParallelDSPPass(Registry);
@@ -364,6 +366,7 @@
   bool addLegalizeMachineIR() override;
   bool addRegBankSelect() override;
   bool addGlobalInstructionSelect() override;
+  void addFastRegAlloc() override;
   void addPreRegAlloc() override;
   void addPreSched2() override;
   void addPreEmitPass() override;
@@ -491,6 +494,22 @@
   return false;
 }
 
+void ARMPassConfig::addFastRegAlloc() {
+  // Bundles must be finalized (register defs/uses added to the BUNDLE MI)
+  // before register allocation, because the register allocator looks only at
+  // top-level MachineInstrs, not at the contents of the bundle. However, this
+  // can't be done in SSA form, as it creates multiple definitions of virtual
+  // registers, which would fail machine verification. It must also be done
+  // after two-address instruction expansion, which removes REG_SEQUENCE.
+  insertPass(&TwoAddressInstructionPassID, &ARMAtomicLoopBundler::ID);
+
+  TargetPassConfig::addFastRegAlloc();
+
+  // Remove the bundles created by AtomicLoopBundler; otherwise instructions
+  // inside the bundle will not be lowered correctly.
+ addPass(createUnpackMachineBundles(nullptr)); +} + void ARMPassConfig::addPreRegAlloc() { if (getOptLevel() != CodeGenOpt::None) { addPass(createMVETPAndVPTOptimisationsPass()); diff --git a/llvm/lib/Target/ARM/CMakeLists.txt b/llvm/lib/Target/ARM/CMakeLists.txt --- a/llvm/lib/Target/ARM/CMakeLists.txt +++ b/llvm/lib/Target/ARM/CMakeLists.txt @@ -22,6 +22,7 @@ add_llvm_target(ARMCodeGen A15SDOptimizer.cpp ARMAsmPrinter.cpp + ARMAtomicLoopBundler.cpp ARMBaseInstrInfo.cpp ARMBaseRegisterInfo.cpp ARMBasicBlockInfo.cpp diff --git a/llvm/test/CodeGen/AArch64/O0-pipeline.ll b/llvm/test/CodeGen/AArch64/O0-pipeline.ll --- a/llvm/test/CodeGen/AArch64/O0-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O0-pipeline.ll @@ -46,7 +46,9 @@ ; CHECK-NEXT: Local Stack Slot Allocation ; CHECK-NEXT: Eliminate PHI nodes for register allocation ; CHECK-NEXT: Two-Address instruction pass +; CHECK-NEXT: AArch64 Atomic Loop Bundler ; CHECK-NEXT: Fast Register Allocator +; CHECK-NEXT: Unpack machine instruction bundles ; CHECK-NEXT: Fixup Statepoint Caller Saved ; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Machine Optimization Remark Emitter diff --git a/llvm/test/CodeGen/AArch64/atomicrmw_exclusive_monitor.ll b/llvm/test/CodeGen/AArch64/atomicrmw_exclusive_monitor.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/atomicrmw_exclusive_monitor.ll @@ -0,0 +1,471 @@ +; RUN: llc -O0 -o - %s | FileCheck %s --check-prefix=CHECK +target triple = "aarch64-none-eabi" + +@atomic_i8 = external global i8 +@atomic_i16 = external global i16 +@atomic_i32 = external global i32 +@atomic_i64 = external global i64 + +@atomic_half = external global half +@atomic_float = external global float +@atomic_double = external global double + + +define i8 @test_xchg_i8() { +entry: + %0 = atomicrmw xchg i8* @atomic_i8, i8 1 monotonic + ; CHECK-LABEL: test_xchg_i8: + ; CHECK: ldxrb {{w[0-9]+}}, [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxrb {{w[0-9]+}}, {{w[0-9]+}}, [[ADDR]] + ret i8 %0 +} +define i8 @test_add_i8() { +entry: + %0 = atomicrmw add i8* @atomic_i8, i8 1 monotonic + ; CHECK-LABEL: test_add_i8: + ; CHECK: ldxrb {{w[0-9]+}}, [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxrb {{w[0-9]+}}, {{w[0-9]+}}, [[ADDR]] + ret i8 %0 +} +define i8 @test_sub_i8() { +entry: + %0 = atomicrmw sub i8* @atomic_i8, i8 1 monotonic + ; CHECK-LABEL: test_sub_i8: + ; CHECK: ldxrb {{w[0-9]+}}, [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxrb {{w[0-9]+}}, {{w[0-9]+}}, [[ADDR]] + ret i8 %0 +} +define i8 @test_and_i8() { +entry: + %0 = atomicrmw and i8* @atomic_i8, i8 1 monotonic + ; CHECK-LABEL: test_and_i8: + ; CHECK: ldxrb {{w[0-9]+}}, [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxrb {{w[0-9]+}}, {{w[0-9]+}}, [[ADDR]] + ret i8 %0 +} +define i8 @test_nand_i8() { +entry: + %0 = atomicrmw nand i8* @atomic_i8, i8 1 monotonic + ; CHECK-LABEL: test_nand_i8: + ; CHECK: ldxrb {{w[0-9]+}}, [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxrb {{w[0-9]+}}, {{w[0-9]+}}, [[ADDR]] + ret i8 %0 +} +define i8 @test_or_i8() { +entry: + %0 = atomicrmw or i8* @atomic_i8, i8 1 monotonic + ; CHECK-LABEL: test_or_i8: + ; CHECK: ldxrb {{w[0-9]+}}, [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxrb {{w[0-9]+}}, {{w[0-9]+}}, [[ADDR]] + ret i8 %0 +} +define i8 @test_xor_i8() { +entry: + %0 = atomicrmw xor i8* @atomic_i8, i8 1 monotonic + ; CHECK-LABEL: test_xor_i8: + ; CHECK: ldxrb {{w[0-9]+}}, [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxrb {{w[0-9]+}}, {{w[0-9]+}}, [[ADDR]] + ret i8 %0 +} +define i8 
@test_max_i8() { +entry: + %0 = atomicrmw max i8* @atomic_i8, i8 1 monotonic + ; CHECK-LABEL: test_max_i8: + ; CHECK: ldxrb {{w[0-9]+}}, [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxrb {{w[0-9]+}}, {{w[0-9]+}}, [[ADDR]] + ret i8 %0 +} +define i8 @test_min_i8() { +entry: + %0 = atomicrmw min i8* @atomic_i8, i8 1 monotonic + ; CHECK-LABEL: test_min_i8: + ; CHECK: ldxrb {{w[0-9]+}}, [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxrb {{w[0-9]+}}, {{w[0-9]+}}, [[ADDR]] + ret i8 %0 +} +define i8 @test_umax_i8() { +entry: + %0 = atomicrmw umax i8* @atomic_i8, i8 1 monotonic + ; CHECK-LABEL: test_umax_i8: + ; CHECK: ldxrb {{w[0-9]+}}, [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxrb {{w[0-9]+}}, {{w[0-9]+}}, [[ADDR]] + ret i8 %0 +} +define i8 @test_umin_i8() { +entry: + %0 = atomicrmw umin i8* @atomic_i8, i8 1 monotonic + ; CHECK-LABEL: test_umin_i8: + ; CHECK: ldxrb {{w[0-9]+}}, [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxrb {{w[0-9]+}}, {{w[0-9]+}}, [[ADDR]] + ret i8 %0 +} + + +define i16 @test_xchg_i16() { +entry: + %0 = atomicrmw xchg i16* @atomic_i16, i16 1 monotonic + ; CHECK-LABEL: test_xchg_i16: + ; CHECK: ldxrh {{w[0-9]+}}, [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxrh {{w[0-9]+}}, {{w[0-9]+}}, [[ADDR]] + ret i16 %0 +} +define i16 @test_add_i16() { +entry: + %0 = atomicrmw add i16* @atomic_i16, i16 1 monotonic + ; CHECK-LABEL: test_add_i16: + ; CHECK: ldxrh {{w[0-9]+}}, [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxrh {{w[0-9]+}}, {{w[0-9]+}}, [[ADDR]] + ret i16 %0 +} +define i16 @test_sub_i16() { +entry: + %0 = atomicrmw sub i16* @atomic_i16, i16 1 monotonic + ; CHECK-LABEL: test_sub_i16: + ; CHECK: ldxrh {{w[0-9]+}}, [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxrh {{w[0-9]+}}, {{w[0-9]+}}, [[ADDR]] + ret i16 %0 +} +define i16 @test_and_i16() { +entry: + %0 = atomicrmw and i16* @atomic_i16, i16 1 monotonic + ; CHECK-LABEL: test_and_i16: + ; CHECK: ldxrh {{w[0-9]+}}, [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxrh {{w[0-9]+}}, {{w[0-9]+}}, [[ADDR]] + ret i16 %0 +} +define i16 @test_nand_i16() { +entry: + %0 = atomicrmw nand i16* @atomic_i16, i16 1 monotonic + ; CHECK-LABEL: test_nand_i16: + ; CHECK: ldxrh {{w[0-9]+}}, [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxrh {{w[0-9]+}}, {{w[0-9]+}}, [[ADDR]] + ret i16 %0 +} +define i16 @test_or_i16() { +entry: + %0 = atomicrmw or i16* @atomic_i16, i16 1 monotonic + ; CHECK-LABEL: test_or_i16: + ; CHECK: ldxrh {{w[0-9]+}}, [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxrh {{w[0-9]+}}, {{w[0-9]+}}, [[ADDR]] + ret i16 %0 +} +define i16 @test_xor_i16() { +entry: + %0 = atomicrmw xor i16* @atomic_i16, i16 1 monotonic + ; CHECK-LABEL: test_xor_i16: + ; CHECK: ldxrh {{w[0-9]+}}, [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxrh {{w[0-9]+}}, {{w[0-9]+}}, [[ADDR]] + ret i16 %0 +} +define i16 @test_max_i16() { +entry: + %0 = atomicrmw max i16* @atomic_i16, i16 1 monotonic + ; CHECK-LABEL: test_max_i16: + ; CHECK: ldxrh {{w[0-9]+}}, [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxrh {{w[0-9]+}}, {{w[0-9]+}}, [[ADDR]] + ret i16 %0 +} +define i16 @test_min_i16() { +entry: + %0 = atomicrmw min i16* @atomic_i16, i16 1 monotonic + ; CHECK-LABEL: test_min_i16: + ; CHECK: ldxrh {{w[0-9]+}}, [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxrh {{w[0-9]+}}, {{w[0-9]+}}, [[ADDR]] + ret i16 %0 +} +define i16 @test_umax_i16() { +entry: + %0 = atomicrmw umax i16* @atomic_i16, i16 1 monotonic + ; CHECK-LABEL: test_umax_i16: + ; CHECK: ldxrh {{w[0-9]+}}, [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; 
CHECK: stxrh {{w[0-9]+}}, {{w[0-9]+}}, [[ADDR]] + ret i16 %0 +} +define i16 @test_umin_i16() { +entry: + %0 = atomicrmw umin i16* @atomic_i16, i16 1 monotonic + ; CHECK-LABEL: test_umin_i16: + ; CHECK: ldxrh {{w[0-9]+}}, [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxrh {{w[0-9]+}}, {{w[0-9]+}}, [[ADDR]] + ret i16 %0 +} +define half @test_fadd_half() { +entry: + %0 = atomicrmw fadd half* @atomic_half, half 1.0 monotonic + ; CHECK-LABEL: test_fadd_half: + ; CHECK: ldaxrh {{w[0-9]+}}, [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stlxrh {{w[0-9]+}}, {{w[0-9]+}}, [[ADDR]] + ret half %0 +} +define half @test_fsub_half() { +entry: + %0 = atomicrmw fsub half* @atomic_half, half 1.0 monotonic + ; CHECK-LABEL: test_fsub_half: + ; CHECK: ldaxrh {{w[0-9]+}}, [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stlxrh {{w[0-9]+}}, {{w[0-9]+}}, [[ADDR]] + ret half %0 +} + + +define i32 @test_xchg_i32() { +entry: + %0 = atomicrmw xchg i32* @atomic_i32, i32 1 monotonic + ; CHECK-LABEL: test_xchg_i32: + ; CHECK: ldxr {{w[0-9]+}}, [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxr {{w[0-9]+}}, {{w[0-9]+}}, [[ADDR]] + ret i32 %0 +} +define i32 @test_add_i32() { +entry: + %0 = atomicrmw add i32* @atomic_i32, i32 1 monotonic + ; CHECK-LABEL: test_add_i32: + ; CHECK: ldxr {{w[0-9]+}}, [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxr {{w[0-9]+}}, {{w[0-9]+}}, [[ADDR]] + ret i32 %0 +} +define i32 @test_sub_i32() { +entry: + %0 = atomicrmw sub i32* @atomic_i32, i32 1 monotonic + ; CHECK-LABEL: test_sub_i32: + ; CHECK: ldxr {{w[0-9]+}}, [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxr {{w[0-9]+}}, {{w[0-9]+}}, [[ADDR]] + ret i32 %0 +} +define i32 @test_and_i32() { +entry: + %0 = atomicrmw and i32* @atomic_i32, i32 1 monotonic + ; CHECK-LABEL: test_and_i32: + ; CHECK: ldxr {{w[0-9]+}}, [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxr {{w[0-9]+}}, {{w[0-9]+}}, [[ADDR]] + ret i32 %0 +} +define i32 @test_nand_i32() { +entry: + %0 = atomicrmw nand i32* @atomic_i32, i32 1 monotonic + ; CHECK-LABEL: test_nand_i32: + ; CHECK: ldxr {{w[0-9]+}}, [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxr {{w[0-9]+}}, {{w[0-9]+}}, [[ADDR]] + ret i32 %0 +} +define i32 @test_or_i32() { +entry: + %0 = atomicrmw or i32* @atomic_i32, i32 1 monotonic + ; CHECK-LABEL: test_or_i32: + ; CHECK: ldxr {{w[0-9]+}}, [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxr {{w[0-9]+}}, {{w[0-9]+}}, [[ADDR]] + ret i32 %0 +} +define i32 @test_xor_i32() { +entry: + %0 = atomicrmw xor i32* @atomic_i32, i32 1 monotonic + ; CHECK-LABEL: test_xor_i32: + ; CHECK: ldxr {{w[0-9]+}}, [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxr {{w[0-9]+}}, {{w[0-9]+}}, [[ADDR]] + ret i32 %0 +} +define i32 @test_max_i32() { +entry: + %0 = atomicrmw max i32* @atomic_i32, i32 1 monotonic + ; CHECK-LABEL: test_max_i32: + ; CHECK: ldxr {{w[0-9]+}}, [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxr {{w[0-9]+}}, {{w[0-9]+}}, [[ADDR]] + ret i32 %0 +} +define i32 @test_min_i32() { +entry: + %0 = atomicrmw min i32* @atomic_i32, i32 1 monotonic + ; CHECK-LABEL: test_min_i32: + ; CHECK: ldxr {{w[0-9]+}}, [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxr {{w[0-9]+}}, {{w[0-9]+}}, [[ADDR]] + ret i32 %0 +} +define i32 @test_umax_i32() { +entry: + %0 = atomicrmw umax i32* @atomic_i32, i32 1 monotonic + ; CHECK-LABEL: test_umax_i32: + ; CHECK: ldxr {{w[0-9]+}}, [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxr {{w[0-9]+}}, {{w[0-9]+}}, [[ADDR]] + ret i32 %0 +} +define i32 @test_umin_i32() { +entry: + %0 = atomicrmw umin i32* @atomic_i32, i32 1 
monotonic + ; CHECK-LABEL: test_umin_i32: + ; CHECK: ldxr {{w[0-9]+}}, [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxr {{w[0-9]+}}, {{w[0-9]+}}, [[ADDR]] + ret i32 %0 +} +define float @test_fadd_float() { +entry: + %0 = atomicrmw fadd float* @atomic_float, float 1.0 monotonic + ; CHECK-LABEL: test_fadd_float: + ; CHECK: ldaxr {{w[0-9]+}}, [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stlxr {{w[0-9]+}}, {{w[0-9]+}}, [[ADDR]] + ret float %0 +} +define float @test_fsub_float() { +entry: + %0 = atomicrmw fsub float* @atomic_float, float 1.0 monotonic + ; CHECK-LABEL: test_fsub_float: + ; CHECK: ldaxr {{w[0-9]+}}, [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stlxr {{w[0-9]+}}, {{w[0-9]+}}, [[ADDR]] + ret float %0 +} + + + + +define i64 @test_xchg_i64() { +entry: + %0 = atomicrmw xchg i64* @atomic_i64, i64 1 monotonic + ; CHECK-LABEL: test_xchg_i64: + ; CHECK: ldxr [[RA:x[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxr {{w[0-9]+}}, {{x[0-9]+}}, [[ADDR]] + ret i64 %0 +} +define i64 @test_add_i64() { +entry: + %0 = atomicrmw add i64* @atomic_i64, i64 1 monotonic + ; CHECK-LABEL: test_add_i64: + ; CHECK: ldxr [[RA:x[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxr {{w[0-9]+}}, {{x[0-9]+}}, [[ADDR]] + ret i64 %0 +} +define i64 @test_sub_i64() { +entry: + %0 = atomicrmw sub i64* @atomic_i64, i64 1 monotonic + ; CHECK-LABEL: test_sub_i64: + ; CHECK: ldxr [[RA:x[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxr {{w[0-9]+}}, {{x[0-9]+}}, [[ADDR]] + ret i64 %0 +} +define i64 @test_and_i64() { +entry: + %0 = atomicrmw and i64* @atomic_i64, i64 1 monotonic + ; CHECK-LABEL: test_and_i64: + ; CHECK: ldxr [[RA:x[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxr {{w[0-9]+}}, {{x[0-9]+}}, [[ADDR]] + ret i64 %0 +} +define i64 @test_nand_i64() { +entry: + %0 = atomicrmw nand i64* @atomic_i64, i64 1 monotonic + ; CHECK-LABEL: test_nand_i64: + ; CHECK: ldxr [[RA:x[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxr {{w[0-9]+}}, {{x[0-9]+}}, [[ADDR]] + ret i64 %0 +} +define i64 @test_or_i64() { +entry: + %0 = atomicrmw or i64* @atomic_i64, i64 1 monotonic + ; CHECK-LABEL: test_or_i64: + ; CHECK: ldxr [[RA:x[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxr {{w[0-9]+}}, {{x[0-9]+}}, [[ADDR]] + ret i64 %0 +} +define i64 @test_xor_i64() { +entry: + %0 = atomicrmw xor i64* @atomic_i64, i64 1 monotonic + ; CHECK-LABEL: test_xor_i64: + ; CHECK: ldxr [[RA:x[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxr {{w[0-9]+}}, {{x[0-9]+}}, [[ADDR]] + ret i64 %0 +} +define i64 @test_max_i64() { +entry: + %0 = atomicrmw max i64* @atomic_i64, i64 1 monotonic + ; CHECK-LABEL: test_max_i64: + ; CHECK: ldxr [[RA:x[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxr {{w[0-9]+}}, {{x[0-9]+}}, [[ADDR]] + ret i64 %0 +} +define i64 @test_min_i64() { +entry: + %0 = atomicrmw min i64* @atomic_i64, i64 1 monotonic + ; CHECK-LABEL: test_min_i64: + ; CHECK: ldxr [[RA:x[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxr {{w[0-9]+}}, {{x[0-9]+}}, [[ADDR]] + ret i64 %0 +} +define i64 @test_umax_i64() { +entry: + %0 = atomicrmw umax i64* @atomic_i64, i64 1 monotonic + ; CHECK-LABEL: test_umax_i64: + ; CHECK: ldxr [[RA:x[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stxr {{w[0-9]+}}, {{x[0-9]+}}, [[ADDR]] + ret i64 %0 +} +define i64 @test_umin_i64() { +entry: + %0 = atomicrmw umin i64* @atomic_i64, i64 1 monotonic + ; CHECK-LABEL: test_umin_i64: + ; CHECK: ldxr [[RA:x[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str 
+ ; CHECK: stxr {{w[0-9]+}}, {{x[0-9]+}}, [[ADDR]] + ret i64 %0 +} +define double @test_fadd_double() { +entry: + %0 = atomicrmw fadd double* @atomic_double, double 1.0 monotonic + ; CHECK-LABEL: test_fadd_double: + ; CHECK: ldaxr [[RA:x[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stlxr {{w[0-9]+}}, {{x[0-9]+}}, [[ADDR]] + ret double %0 +} +define double @test_fsub_double() { +entry: + %0 = atomicrmw fsub double* @atomic_double, double 1.0 monotonic + ; CHECK-LABEL: test_fsub_double: + ; CHECK: ldaxr [[RA:x[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; CHECK: stlxr {{w[0-9]+}}, {{x[0-9]+}}, [[ADDR]] + ret double %0 +} diff --git a/llvm/test/CodeGen/AMDGPU/fast-regalloc-bundles.mir b/llvm/test/CodeGen/AMDGPU/fast-regalloc-bundles.mir --- a/llvm/test/CodeGen/AMDGPU/fast-regalloc-bundles.mir +++ b/llvm/test/CodeGen/AMDGPU/fast-regalloc-bundles.mir @@ -11,10 +11,10 @@ body: | bb.0: ; GCN-LABEL: name: fast_regalloc_bundle_handling - ; GCN: renamable $vgpr0 = IMPLICIT_DEF ; GCN: renamable $vgpr1 = IMPLICIT_DEF - ; GCN: renamable $vgpr0 = BUNDLE implicit killed renamable $vgpr0, implicit killed renamable $vgpr1, implicit $exec { - ; GCN: renamable $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec + ; GCN: renamable $vgpr2 = IMPLICIT_DEF + ; GCN: renamable $vgpr0 = BUNDLE implicit killed renamable $vgpr1, implicit killed renamable $vgpr2, implicit $exec { + ; GCN: renamable $vgpr0 = V_ADD_U32_e32 $vgpr1, $vgpr2, implicit $exec ; GCN: } ; GCN: S_ENDPGM 0, implicit killed renamable $vgpr0 %0 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/ARM/atomicrmw_exclusive_monitor.ll b/llvm/test/CodeGen/ARM/atomicrmw_exclusive_monitor.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/ARM/atomicrmw_exclusive_monitor.ll @@ -0,0 +1,381 @@ +; RUN: llc -O0 -o - %s | FileCheck %s --check-prefix=CHECK +target triple = "armv7-none-eabi" + +@atomic_i8 = external global i8 +@atomic_i16 = external global i16 +@atomic_i32 = external global i32 +@atomic_i64 = external global i64 + +@atomic_half = external global half +@atomic_float = external global float +@atomic_double = external global double + + +define i8 @test_xchg_i8() { +entry: + %0 = atomicrmw xchg i8* @atomic_i8, i8 1 monotonic + ; CHECK-LABEL: test_xchg_i8: + ; CHECK: ldrexb {{r[0-9]+|lr}}, [[ADDR:.(r[0-9]+|lr).]] + ; CHECK-NOT: str + ; CHECK: strexb {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, [[ADDR]] + ret i8 %0 +} +define i8 @test_add_i8() { +entry: + %0 = atomicrmw add i8* @atomic_i8, i8 1 monotonic + ; CHECK-LABEL: test_add_i8: + ; CHECK: ldrexb {{r[0-9]+|lr}}, [[ADDR:.(r[0-9]+|lr).]] + ; CHECK-NOT: str + ; CHECK: strexb {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, [[ADDR]] + ret i8 %0 +} +define i8 @test_sub_i8() { +entry: + %0 = atomicrmw sub i8* @atomic_i8, i8 1 monotonic + ; CHECK-LABEL: test_sub_i8: + ; CHECK: ldrexb {{r[0-9]+|lr}}, [[ADDR:.(r[0-9]+|lr).]] + ; CHECK-NOT: str + ; CHECK: strexb {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, [[ADDR]] + ret i8 %0 +} +define i8 @test_and_i8() { +entry: + %0 = atomicrmw and i8* @atomic_i8, i8 1 monotonic + ; CHECK-LABEL: test_and_i8: + ; CHECK: ldrexb {{r[0-9]+|lr}}, [[ADDR:.(r[0-9]+|lr).]] + ; CHECK-NOT: str + ; CHECK: strexb {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, [[ADDR]] + ret i8 %0 +} +define i8 @test_nand_i8() { +entry: + %0 = atomicrmw nand i8* @atomic_i8, i8 1 monotonic + ; CHECK-LABEL: test_nand_i8: + ; CHECK: ldrexb {{r[0-9]+|lr}}, [[ADDR:.(r[0-9]+|lr).]] + ; CHECK-NOT: str + ; CHECK: strexb {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, [[ADDR]] + ret i8 %0 +} +define i8 @test_or_i8() { +entry: + %0 = atomicrmw or i8* 
@atomic_i8, i8 1 monotonic + ; CHECK-LABEL: test_or_i8: + ; CHECK: ldrexb {{r[0-9]+|lr}}, [[ADDR:.(r[0-9]+|lr).]] + ; CHECK-NOT: str + ; CHECK: strexb {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, [[ADDR]] + ret i8 %0 +} +define i8 @test_xor_i8() { +entry: + %0 = atomicrmw xor i8* @atomic_i8, i8 1 monotonic + ; CHECK-LABEL: test_xor_i8: + ; CHECK: ldrexb {{r[0-9]+|lr}}, [[ADDR:.(r[0-9]+|lr).]] + ; CHECK-NOT: str + ; CHECK: strexb {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, [[ADDR]] + ret i8 %0 +} +define i8 @test_max_i8() { +entry: + %0 = atomicrmw max i8* @atomic_i8, i8 1 monotonic + ; CHECK-LABEL: test_max_i8: + ; CHECK: ldrexb {{r[0-9]+|lr}}, [[ADDR:.(r[0-9]+|lr).]] + ; CHECK-NOT: str + ; CHECK: strexb {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, [[ADDR]] + ret i8 %0 +} +define i8 @test_min_i8() { +entry: + %0 = atomicrmw min i8* @atomic_i8, i8 1 monotonic + ; CHECK-LABEL: test_min_i8: + ; CHECK: ldrexb {{r[0-9]+|lr}}, [[ADDR:.(r[0-9]+|lr).]] + ; CHECK-NOT: str + ; CHECK: strexb {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, [[ADDR]] + ret i8 %0 +} +define i8 @test_umax_i8() { +entry: + %0 = atomicrmw umax i8* @atomic_i8, i8 1 monotonic + ; CHECK-LABEL: test_umax_i8: + ; CHECK: ldrexb {{r[0-9]+|lr}}, [[ADDR:.(r[0-9]+|lr).]] + ; CHECK-NOT: str + ; CHECK: strexb {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, [[ADDR]] + ret i8 %0 +} +define i8 @test_umin_i8() { +entry: + %0 = atomicrmw umin i8* @atomic_i8, i8 1 monotonic + ; CHECK-LABEL: test_umin_i8: + ; CHECK: ldrexb {{r[0-9]+|lr}}, [[ADDR:.(r[0-9]+|lr).]] + ; CHECK-NOT: str + ; CHECK: strexb {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, [[ADDR]] + ret i8 %0 +} + + +define i16 @test_xchg_i16() { +entry: + %0 = atomicrmw xchg i16* @atomic_i16, i16 1 monotonic + ; CHECK-LABEL: test_xchg_i16: + ; CHECK: ldrexh {{r[0-9]+|lr}}, [[ADDR:.(r[0-9]+|lr).]] + ; CHECK-NOT: str + ; CHECK: strexh {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, [[ADDR]] + ret i16 %0 +} +define i16 @test_add_i16() { +entry: + %0 = atomicrmw add i16* @atomic_i16, i16 1 monotonic + ; CHECK-LABEL: test_add_i16: + ; CHECK: ldrexh {{r[0-9]+|lr}}, [[ADDR:.(r[0-9]+|lr).]] + ; CHECK-NOT: str + ; CHECK: strexh {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, [[ADDR]] + ret i16 %0 +} +define i16 @test_sub_i16() { +entry: + %0 = atomicrmw sub i16* @atomic_i16, i16 1 monotonic + ; CHECK-LABEL: test_sub_i16: + ; CHECK: ldrexh {{r[0-9]+|lr}}, [[ADDR:.(r[0-9]+|lr).]] + ; CHECK-NOT: str + ; CHECK: strexh {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, [[ADDR]] + ret i16 %0 +} +define i16 @test_and_i16() { +entry: + %0 = atomicrmw and i16* @atomic_i16, i16 1 monotonic + ; CHECK-LABEL: test_and_i16: + ; CHECK: ldrexh {{r[0-9]+|lr}}, [[ADDR:.(r[0-9]+|lr).]] + ; CHECK-NOT: str + ; CHECK: strexh {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, [[ADDR]] + ret i16 %0 +} +define i16 @test_nand_i16() { +entry: + %0 = atomicrmw nand i16* @atomic_i16, i16 1 monotonic + ; CHECK-LABEL: test_nand_i16: + ; CHECK: ldrexh {{r[0-9]+|lr}}, [[ADDR:.(r[0-9]+|lr).]] + ; CHECK-NOT: str + ; CHECK: strexh {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, [[ADDR]] + ret i16 %0 +} +define i16 @test_or_i16() { +entry: + %0 = atomicrmw or i16* @atomic_i16, i16 1 monotonic + ; CHECK-LABEL: test_or_i16: + ; CHECK: ldrexh {{r[0-9]+|lr}}, [[ADDR:.(r[0-9]+|lr).]] + ; CHECK-NOT: str + ; CHECK: strexh {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, [[ADDR]] + ret i16 %0 +} +define i16 @test_xor_i16() { +entry: + %0 = atomicrmw xor i16* @atomic_i16, i16 1 monotonic + ; CHECK-LABEL: test_xor_i16: + ; CHECK: ldrexh {{r[0-9]+|lr}}, [[ADDR:.(r[0-9]+|lr).]] + ; CHECK-NOT: str + ; CHECK: strexh {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, [[ADDR]] + ret i16 %0 +} +define i16 @test_max_i16() { +entry: + %0 = atomicrmw 
max i16* @atomic_i16, i16 1 monotonic + ; CHECK-LABEL: test_max_i16: + ; CHECK: ldrexh {{r[0-9]+|lr}}, [[ADDR:.(r[0-9]+|lr).]] + ; CHECK-NOT: str + ; CHECK: strexh {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, [[ADDR]] + ret i16 %0 +} +define i16 @test_min_i16() { +entry: + %0 = atomicrmw min i16* @atomic_i16, i16 1 monotonic + ; CHECK-LABEL: test_min_i16: + ; CHECK: ldrexh {{r[0-9]+|lr}}, [[ADDR:.(r[0-9]+|lr).]] + ; CHECK-NOT: str + ; CHECK: strexh {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, [[ADDR]] + ret i16 %0 +} +define i16 @test_umax_i16() { +entry: + %0 = atomicrmw umax i16* @atomic_i16, i16 1 monotonic + ; CHECK-LABEL: test_umax_i16: + ; CHECK: ldrexh {{r[0-9]+|lr}}, [[ADDR:.(r[0-9]+|lr).]] + ; CHECK-NOT: str + ; CHECK: strexh {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, [[ADDR]] + ret i16 %0 +} +define i16 @test_umin_i16() { +entry: + %0 = atomicrmw umin i16* @atomic_i16, i16 1 monotonic + ; CHECK-LABEL: test_umin_i16: + ; CHECK: ldrexh {{r[0-9]+|lr}}, [[ADDR:.(r[0-9]+|lr).]] + ; CHECK-NOT: str + ; CHECK: strexh {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, [[ADDR]] + ret i16 %0 +} + + +define i32 @test_xchg_i32() { +entry: + %0 = atomicrmw xchg i32* @atomic_i32, i32 1 monotonic + ; CHECK-LABEL: test_xchg_i32: + ; CHECK: ldrex {{r[0-9]+|lr}}, [[ADDR:.(r[0-9]+|lr).]] + ; CHECK-NOT: str + ; CHECK: strex {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, [[ADDR]] + ret i32 %0 +} +define i32 @test_add_i32() { +entry: + %0 = atomicrmw add i32* @atomic_i32, i32 1 monotonic + ; CHECK-LABEL: test_add_i32: + ; CHECK: ldrex {{r[0-9]+|lr}}, [[ADDR:.(r[0-9]+|lr).]] + ; CHECK-NOT: str + ; CHECK: strex {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, [[ADDR]] + ret i32 %0 +} +define i32 @test_sub_i32() { +entry: + %0 = atomicrmw sub i32* @atomic_i32, i32 1 monotonic + ; CHECK-LABEL: test_sub_i32: + ; CHECK: ldrex {{r[0-9]+|lr}}, [[ADDR:.(r[0-9]+|lr).]] + ; CHECK-NOT: str + ; CHECK: strex {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, [[ADDR]] + ret i32 %0 +} +define i32 @test_and_i32() { +entry: + %0 = atomicrmw and i32* @atomic_i32, i32 1 monotonic + ; CHECK-LABEL: test_and_i32: + ; CHECK: ldrex {{r[0-9]+|lr}}, [[ADDR:.(r[0-9]+|lr).]] + ; CHECK-NOT: str + ; CHECK: strex {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, [[ADDR]] + ret i32 %0 +} +define i32 @test_nand_i32() { +entry: + %0 = atomicrmw nand i32* @atomic_i32, i32 1 monotonic + ; CHECK-LABEL: test_nand_i32: + ; CHECK: ldrex {{r[0-9]+|lr}}, [[ADDR:.(r[0-9]+|lr).]] + ; CHECK-NOT: str + ; CHECK: strex {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, [[ADDR]] + ret i32 %0 +} +define i32 @test_or_i32() { +entry: + %0 = atomicrmw or i32* @atomic_i32, i32 1 monotonic + ; CHECK-LABEL: test_or_i32: + ; CHECK: ldrex {{r[0-9]+|lr}}, [[ADDR:.(r[0-9]+|lr).]] + ; CHECK-NOT: str + ; CHECK: strex {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, [[ADDR]] + ret i32 %0 +} +define i32 @test_xor_i32() { +entry: + %0 = atomicrmw xor i32* @atomic_i32, i32 1 monotonic + ; CHECK-LABEL: test_xor_i32: + ; CHECK: ldrex {{r[0-9]+|lr}}, [[ADDR:.(r[0-9]+|lr).]] + ; CHECK-NOT: str + ; CHECK: strex {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, [[ADDR]] + ret i32 %0 +} +define i32 @test_max_i32() { +entry: + %0 = atomicrmw max i32* @atomic_i32, i32 1 monotonic + ; CHECK-LABEL: test_max_i32: + ; CHECK: ldrex {{r[0-9]+|lr}}, [[ADDR:.(r[0-9]+|lr).]] + ; CHECK-NOT: str + ; CHECK: strex {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, [[ADDR]] + ret i32 %0 +} +define i32 @test_min_i32() { +entry: + %0 = atomicrmw min i32* @atomic_i32, i32 1 monotonic + ; CHECK-LABEL: test_min_i32: + ; CHECK: ldrex {{r[0-9]+|lr}}, [[ADDR:.(r[0-9]+|lr).]] + ; CHECK-NOT: str + ; CHECK: strex {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, [[ADDR]] + ret i32 %0 +} +define i32 
@test_umax_i32() { +entry: + %0 = atomicrmw umax i32* @atomic_i32, i32 1 monotonic + ; CHECK-LABEL: test_umax_i32: + ; CHECK: ldrex {{r[0-9]+|lr}}, [[ADDR:.(r[0-9]+|lr).]] + ; CHECK-NOT: str + ; CHECK: strex {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, [[ADDR]] + ret i32 %0 +} +define i32 @test_umin_i32() { +entry: + %0 = atomicrmw umin i32* @atomic_i32, i32 1 monotonic + ; CHECK-LABEL: test_umin_i32: + ; CHECK: ldrex {{r[0-9]+|lr}}, [[ADDR:.(r[0-9]+|lr).]] + ; CHECK-NOT: str + ; CHECK: strex {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, [[ADDR]] + ret i32 %0 +} + + + + +define i64 @test_xchg_i64() { +entry: + %0 = atomicrmw xchg i64* @atomic_i64, i64 1 monotonic + ; CHECK-LABEL: test_xchg_i64: + ; CHECK: ldrexd {{r[0-9]+|lr}}, [[RB:r[0-9]+]], [[ADDR:.(r[0-9]+|lr).]] + ; CHECK-NOT: str + ; CHECK: strexd {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, [[ADDR]] + ret i64 %0 +} +define i64 @test_add_i64() { +entry: + %0 = atomicrmw add i64* @atomic_i64, i64 1 monotonic + ; CHECK-LABEL: test_add_i64: + ; CHECK: ldrexd {{r[0-9]+|lr}}, [[RB:r[0-9]+]], [[ADDR:.(r[0-9]+|lr).]] + ; CHECK-NOT: str + ; CHECK: strexd {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, [[ADDR]] + ret i64 %0 +} +define i64 @test_sub_i64() { +entry: + %0 = atomicrmw sub i64* @atomic_i64, i64 1 monotonic + ; CHECK-LABEL: test_sub_i64: + ; CHECK: ldrexd {{r[0-9]+|lr}}, [[RB:r[0-9]+]], [[ADDR:.(r[0-9]+|lr).]] + ; CHECK-NOT: str + ; CHECK: strexd {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, [[ADDR]] + ret i64 %0 +} +define i64 @test_and_i64() { +entry: + %0 = atomicrmw and i64* @atomic_i64, i64 1 monotonic + ; CHECK-LABEL: test_and_i64: + ; CHECK: ldrexd {{r[0-9]+|lr}}, [[RB:r[0-9]+]], [[ADDR:.(r[0-9]+|lr).]] + ; CHECK-NOT: str + ; CHECK: strexd {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, [[ADDR]] + ret i64 %0 +} +define i64 @test_nand_i64() { +entry: + %0 = atomicrmw nand i64* @atomic_i64, i64 1 monotonic + ; CHECK-LABEL: test_nand_i64: + ; CHECK: ldrexd {{r[0-9]+|lr}}, [[RB:r[0-9]+]], [[ADDR:.(r[0-9]+|lr).]] + ; CHECK-NOT: str + ; CHECK: strexd {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, [[ADDR]] + ret i64 %0 +} +define i64 @test_or_i64() { +entry: + %0 = atomicrmw or i64* @atomic_i64, i64 1 monotonic + ; CHECK-LABEL: test_or_i64: + ; CHECK: ldrexd {{r[0-9]+|lr}}, [[RB:r[0-9]+]], [[ADDR:.(r[0-9]+|lr).]] + ; CHECK-NOT: str + ; CHECK: strexd {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, [[ADDR]] + ret i64 %0 +} +define i64 @test_xor_i64() { +entry: + %0 = atomicrmw xor i64* @atomic_i64, i64 1 monotonic + ; CHECK-LABEL: test_xor_i64: + ; CHECK: ldrexd {{r[0-9]+|lr}}, [[RB:r[0-9]+]], [[ADDR:.(r[0-9]+|lr).]] + ; CHECK-NOT: str + ; CHECK: strexd {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, {{r[0-9]+|lr}}, [[ADDR]] + ret i64 %0 +}