diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -1026,6 +1026,11 @@
     return false;
   }

+  /// On some targets, inserting a spill between certain instructions (e.g. an
+  /// exclusive load/store pair) can break the semantics of atomic operations.
+  /// Returns a suitable point before which the spill can be inserted.
+  virtual MachineBasicBlock::iterator findSpillBefore(MachineInstr &MI) const;
+
   //===--------------------------------------------------------------------===//
   /// Debug information queries.

diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp
--- a/llvm/lib/CodeGen/RegAllocFast.cpp
+++ b/llvm/lib/CodeGen/RegAllocFast.cpp
@@ -883,8 +883,7 @@
   assert(PhysReg != 0 && "Register not assigned");
   if (LRI->Reloaded || LRI->LiveOut) {
     if (!MI.isImplicitDef()) {
-      MachineBasicBlock::iterator SpillBefore =
-          std::next((MachineBasicBlock::iterator)MI.getIterator());
+      auto SpillBefore = TRI->findSpillBefore(MI);
       LLVM_DEBUG(dbgs() << "Spill Reason: LO: " << LRI->LiveOut << " RL: "
                         << LRI->Reloaded << '\n');
       bool Kill = LRI->LastUse == nullptr;
diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
--- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
@@ -76,6 +76,11 @@
   return true;
 }

+MachineBasicBlock::iterator
+TargetRegisterInfo::findSpillBefore(MachineInstr &MI) const {
+  return std::next((MachineBasicBlock::iterator)MI.getIterator());
+}
+
 void TargetRegisterInfo::markSuperRegs(BitVector &RegisterSet,
                                        MCRegister Reg) const {
   for (MCSuperRegIterator AI(Reg, this, true); AI.isValid(); ++AI)
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -135,6 +135,8 @@
                       unsigned DstSubReg,
                       const TargetRegisterClass *NewRC,
                       LiveIntervals &LIS) const override;
+  MachineBasicBlock::iterator findSpillBefore(MachineInstr &MI) const override;
+
   void getOffsetOpcodes(const StackOffset &Offset,
                         SmallVectorImpl<uint64_t> &Ops) const override;
 };
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -781,3 +781,64 @@
     return false;
   return true;
 }
+
+/// Select an appropriate instruction to spill before. Normally this is the
+/// instruction right after the current one, but in the case of a load/store
+/// exclusive pair (ldxr/stxr) we must avoid introducing a store between the
+/// two instructions, as it may clear the exclusive monitor and invalidate them.
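+///
+/// For illustration only (the register names, label and stack offset below are
+/// hypothetical), the -O0 expansion of a monotonic atomicrmw add looks roughly
+/// like:
+///
+///   .LBB0_1:
+///     ldxr  x8, [x9]        // opens the exclusive monitor
+///     add   x8, x8, #1
+///     str   x8, [sp, #12]   // a spill inserted here may clear the monitor,
+///     stxr  w10, x8, [x9]   // so the store-exclusive can fail on every
+///     cbnz  w10, .LBB0_1    // iteration and the loop may never terminate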
+MachineBasicBlock::iterator
+AArch64RegisterInfo::findSpillBefore(MachineInstr &MI) const {
+  auto IsExclusiveLoad = [](const MachineInstr &MI) -> Optional<unsigned> {
+    switch (MI.getOpcode()) {
+    case AArch64::LDXPW:
+      return AArch64::STXPW;
+    case AArch64::LDXPX:
+      return AArch64::STXPX;
+    case AArch64::LDXRB:
+      return AArch64::STXRB;
+    case AArch64::LDXRH:
+      return AArch64::STXRH;
+    case AArch64::LDXRW:
+      return AArch64::STXRW;
+    case AArch64::LDXRX:
+      return AArch64::STXRX;
+    default:
+      return None;
+    }
+  };
+
+  MachineBasicBlock &MBB = *MI.getParent();
+  const MachineBasicBlock::reverse_iterator RE = MBB.rend();
+  const MachineBasicBlock::iterator E = MBB.end();
+
+  // Default value: spill before the next instruction.
+  auto SpillBefore = std::next((MachineBasicBlock::iterator)MI.getIterator());
+
+  // If we are already at the end of the block, spill here.
+  if (SpillBefore == E)
+    return SpillBefore;
+
+  // See if we are between an exclusive load/store pair.
+  const auto LoadInst =
+      std::find_if((MachineBasicBlock::reverse_iterator)MI.getReverseIterator(),
+                   RE, IsExclusiveLoad);
+
+  // If there is no exclusive load earlier in this block, return the default.
+  if (LoadInst == RE)
+    return SpillBefore;
+
+  // Find the corresponding exclusive store.
+  const unsigned StoreOp = IsExclusiveLoad(*LoadInst).getValue();
+  while (SpillBefore != E && SpillBefore->getOpcode() != StoreOp) {
+    SpillBefore = std::next(SpillBefore);
+  }
+
+  // If we failed to find an exclusive store, return the default.
+  if (SpillBefore == E)
+    return std::next((MachineBasicBlock::iterator)MI.getIterator());
+
+  // We want to spill after the store.
+  SpillBefore = std::next(SpillBefore);
+
+  return SpillBefore;
+}
\ No newline at end of file
diff --git a/llvm/lib/Target/ARM/ARMRegisterInfo.h b/llvm/lib/Target/ARM/ARMRegisterInfo.h
--- a/llvm/lib/Target/ARM/ARMRegisterInfo.h
+++ b/llvm/lib/Target/ARM/ARMRegisterInfo.h
@@ -23,6 +23,7 @@
   virtual void anchor();
 public:
   ARMRegisterInfo();
+  MachineBasicBlock::iterator findSpillBefore(MachineInstr &MI) const override;
 };

 } // end namespace llvm
diff --git a/llvm/lib/Target/ARM/ARMRegisterInfo.cpp b/llvm/lib/Target/ARM/ARMRegisterInfo.cpp
--- a/llvm/lib/Target/ARM/ARMRegisterInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMRegisterInfo.cpp
@@ -16,3 +16,68 @@
 void ARMRegisterInfo::anchor() { }

 ARMRegisterInfo::ARMRegisterInfo() : ARMBaseRegisterInfo() {}
+
+/// Select an appropriate instruction to spill before. Normally this is the
+/// instruction right after the current one, but in the case of an ldrex/strex
+/// pair we must avoid introducing a store between the exclusive instructions,
+/// as it may clear the exclusive monitor and invalidate them.
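+///
+/// For illustration only (hypothetical registers and offsets), the ARM
+/// expansion has the same shape:
+///
+///     ldrex r0, [r1]        @ opens the exclusive monitor
+///     add   r0, r0, #1
+///     str   r0, [sp, #4]    @ a spill inserted here may clear the monitor,
+///     strex r2, r0, [r1]    @ so the store-exclusive may always fail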
+MachineBasicBlock::iterator
+ARMRegisterInfo::findSpillBefore(MachineInstr &MI) const {
+  auto IsExclusiveLoad = [](const MachineInstr &MI) -> Optional<unsigned> {
+    switch (MI.getOpcode()) {
+    case ARM::t2LDREX:
+      return ARM::t2STREX;
+    case ARM::t2LDREXB:
+      return ARM::t2STREXB;
+    case ARM::t2LDREXD:
+      return ARM::t2STREXD;
+    case ARM::t2LDREXH:
+      return ARM::t2STREXH;
+    case ARM::LDREX:
+      return ARM::STREX;
+    case ARM::LDREXB:
+      return ARM::STREXB;
+    case ARM::LDREXD:
+      return ARM::STREXD;
+    case ARM::LDREXH:
+      return ARM::STREXH;
+    default:
+      return None;
+    }
+  };
+
+  MachineBasicBlock &MBB = *MI.getParent();
+  const MachineBasicBlock::reverse_iterator RE = MBB.rend();
+  const MachineBasicBlock::iterator E = MBB.end();
+
+  // Default value: spill before the next instruction.
+  auto SpillBefore = std::next((MachineBasicBlock::iterator)MI.getIterator());
+
+  // If we are already at the end of the block, spill here.
+  if (SpillBefore == E)
+    return SpillBefore;
+
+  // See if we are between an exclusive load/store pair.
+  const auto LoadInst =
+      std::find_if((MachineBasicBlock::reverse_iterator)MI.getReverseIterator(),
+                   RE, IsExclusiveLoad);
+
+  // If there is no exclusive load earlier in this block, return the default.
+  if (LoadInst == RE)
+    return SpillBefore;
+
+  // Find the corresponding exclusive store.
+  const unsigned StoreOp = IsExclusiveLoad(*LoadInst).getValue();
+  while (SpillBefore != E && SpillBefore->getOpcode() != StoreOp) {
+    SpillBefore = std::next(SpillBefore);
+  }
+
+  // If we failed to find an exclusive store, return the default.
+  if (SpillBefore == E)
+    return std::next((MachineBasicBlock::iterator)MI.getIterator());
+
+  // We want to spill after the store.
+  SpillBefore = std::next(SpillBefore);
+
+  return SpillBefore;
+}
diff --git a/llvm/test/CodeGen/ARM/atomicrmw_exclusive_monitor.ll b/llvm/test/CodeGen/ARM/atomicrmw_exclusive_monitor.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/atomicrmw_exclusive_monitor.ll
@@ -0,0 +1,578 @@
+; RUN: llc -O0 -o - %s --mtriple=armv8-none-eabi | FileCheck %s --check-prefix=CHECK --check-prefix=ARM
+; RUN: llc -O0 -o - %s --mtriple=aarch64-none-eabi | FileCheck %s --check-prefix=CHECK --check-prefix=A64
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+
+@atomic_i8 = external global i8
+@atomic_i16 = external global i16
+@atomic_i32 = external global i32
+@atomic_i64 = external global i64
+
+@atomic_half = external global half
+@atomic_float = external global float
+@atomic_double = external global double
+
+
+define i8 @test_xchg_i8() {
+entry:
+  %0 = atomicrmw xchg i8* @atomic_i8, i8 1 monotonic
+  ; CHECK: test_xchg_i8:
+  ; ARM: ldrexb [[RA:r[0-9]+]], [[ADDR:.r[0-9]+.]]
+  ; A64: ldxrb [[RA:w[0-9]+]], [[ADDR:.x[0-9]+.]]
+  ; CHECK-NOT: str
+  ; ARM: strexb [[RA]], {{r[0-9]+}}, [[ADDR]]
+  ; A64: stxrb [[RA]], {{w[0-9]+}}, [[ADDR]]
+  ret i8 %0
+}
+define i8 @test_add_i8() {
+entry:
+  %0 = atomicrmw add i8* @atomic_i8, i8 1 monotonic
+  ; CHECK: test_add_i8:
+  ; ARM: ldrexb [[RA:r[0-9]+]], [[ADDR:.r[0-9]+.]]
+  ; A64: ldxrb [[RA:w[0-9]+]], [[ADDR:.x[0-9]+.]]
+  ; CHECK-NOT: str
+  ; ARM: strexb [[RA]], {{r[0-9]+}}, [[ADDR]]
+  ; A64: stxrb [[RA]], {{w[0-9]+}}, [[ADDR]]
+  ret i8 %0
+}
+define i8 @test_sub_i8() {
+entry:
+  %0 = atomicrmw sub i8* @atomic_i8, i8 1 monotonic
+  ; CHECK: test_sub_i8:
+  ; ARM: ldrexb [[RA:r[0-9]+]], [[ADDR:.r[0-9]+.]]
+  ; A64: ldxrb [[RA:w[0-9]+]], [[ADDR:.x[0-9]+.]]
+  ; CHECK-NOT: str
+  ; ARM: strexb [[RA]], {{r[0-9]+}}, [[ADDR]]
+  ; A64: stxrb [[RA]], {{w[0-9]+}}, 
[[ADDR]] + ret i8 %0 +} +define i8 @test_and_i8() { +entry: + %0 = atomicrmw and i8* @atomic_i8, i8 1 monotonic + ; CHECK: test_and_i8: + ; ARM: ldrexb [[RA:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldxrb [[RA:w[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strexb [[RA]], {{r[0-9]+}}, [[ADDR]] + ; A64: stxrb [[RA]], {{w[0-9]+}}, [[ADDR]] + ret i8 %0 +} +define i8 @test_nand_i8() { +entry: + %0 = atomicrmw nand i8* @atomic_i8, i8 1 monotonic + ; CHECK: test_nand_i8: + ; ARM: ldrexb [[RA:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldxrb [[RA:w[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strexb [[RA]], {{r[0-9]+}}, [[ADDR]] + ; A64: stxrb [[RA]], {{w[0-9]+}}, [[ADDR]] + ret i8 %0 +} +define i8 @test_or_i8() { +entry: + %0 = atomicrmw or i8* @atomic_i8, i8 1 monotonic + ; CHECK: test_or_i8: + ; ARM: ldrexb [[RA:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldxrb [[RA:w[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strexb [[RA]], {{r[0-9]+}}, [[ADDR]] + ; A64: stxrb [[RA]], {{w[0-9]+}}, [[ADDR]] + ret i8 %0 +} +define i8 @test_xor_i8() { +entry: + %0 = atomicrmw xor i8* @atomic_i8, i8 1 monotonic + ; CHECK: test_xor_i8: + ; ARM: ldrexb [[RA:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldxrb [[RA:w[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strexb [[RA]], {{r[0-9]+}}, [[ADDR]] + ; A64: stxrb [[RA]], {{w[0-9]+}}, [[ADDR]] + ret i8 %0 +} +define i8 @test_max_i8() { +entry: + %0 = atomicrmw max i8* @atomic_i8, i8 1 monotonic + ; CHECK: test_max_i8: + ; ARM: ldrexb [[RA:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldxrb [[RA:w[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strexb [[RA]], {{r[0-9]+}}, [[ADDR]] + ; A64: stxrb [[RA]], {{w[0-9]+}}, [[ADDR]] + ret i8 %0 +} +define i8 @test_min_i8() { +entry: + %0 = atomicrmw min i8* @atomic_i8, i8 1 monotonic + ; CHECK: test_min_i8: + ; ARM: ldrexb [[RA:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldxrb [[RA:w[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strexb [[RA]], {{r[0-9]+}}, [[ADDR]] + ; A64: stxrb [[RA]], {{w[0-9]+}}, [[ADDR]] + ret i8 %0 +} +define i8 @test_umax_i8() { +entry: + %0 = atomicrmw umax i8* @atomic_i8, i8 1 monotonic + ; CHECK: test_umax_i8: + ; ARM: ldrexb [[RA:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldxrb [[RA:w[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strexb [[RA]], {{r[0-9]+}}, [[ADDR]] + ; A64: stxrb [[RA]], {{w[0-9]+}}, [[ADDR]] + ret i8 %0 +} +define i8 @test_umin_i8() { +entry: + %0 = atomicrmw umin i8* @atomic_i8, i8 1 monotonic + ; CHECK: test_umin_i8: + ; ARM: ldrexb [[RA:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldxrb [[RA:w[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strexb [[RA]], {{r[0-9]+}}, [[ADDR]] + ; A64: stxrb [[RA]], {{w[0-9]+}}, [[ADDR]] + ret i8 %0 +} + + +define i16 @test_xchg_i16() { +entry: + %0 = atomicrmw xchg i16* @atomic_i16, i16 1 monotonic + ; CHECK: test_xchg_i16: + ; ARM: ldrexh [[RA:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldxrh [[RA:w[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strexh [[RA]], {{r[0-9]+}}, [[ADDR]] + ; A64: stxrh [[RA]], {{w[0-9]+}}, [[ADDR]] + ret i16 %0 +} +define i16 @test_add_i16() { +entry: + %0 = atomicrmw add i16* @atomic_i16, i16 1 monotonic + ; CHECK: test_add_i16: + ; ARM: ldrexh [[RA:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldxrh [[RA:w[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strexh [[RA]], {{r[0-9]+}}, [[ADDR]] + ; A64: stxrh [[RA]], {{w[0-9]+}}, [[ADDR]] + ret i16 %0 +} +define i16 @test_sub_i16() { +entry: + %0 = atomicrmw sub i16* @atomic_i16, i16 1 monotonic + ; CHECK: test_sub_i16: + ; 
ARM: ldrexh [[RA:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldxrh [[RA:w[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strexh [[RA]], {{r[0-9]+}}, [[ADDR]] + ; A64: stxrh [[RA]], {{w[0-9]+}}, [[ADDR]] + ret i16 %0 +} +define i16 @test_and_i16() { +entry: + %0 = atomicrmw and i16* @atomic_i16, i16 1 monotonic + ; CHECK: test_and_i16: + ; ARM: ldrexh [[RA:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldxrh [[RA:w[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strexh [[RA]], {{r[0-9]+}}, [[ADDR]] + ; A64: stxrh [[RA]], {{w[0-9]+}}, [[ADDR]] + ret i16 %0 +} +define i16 @test_nand_i16() { +entry: + %0 = atomicrmw nand i16* @atomic_i16, i16 1 monotonic + ; CHECK: test_nand_i16: + ; ARM: ldrexh [[RA:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldxrh [[RA:w[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strexh [[RA]], {{r[0-9]+}}, [[ADDR]] + ; A64: stxrh [[RA]], {{w[0-9]+}}, [[ADDR]] + ret i16 %0 +} +define i16 @test_or_i16() { +entry: + %0 = atomicrmw or i16* @atomic_i16, i16 1 monotonic + ; CHECK: test_or_i16: + ; ARM: ldrexh [[RA:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldxrh [[RA:w[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strexh [[RA]], {{r[0-9]+}}, [[ADDR]] + ; A64: stxrh [[RA]], {{w[0-9]+}}, [[ADDR]] + ret i16 %0 +} +define i16 @test_xor_i16() { +entry: + %0 = atomicrmw xor i16* @atomic_i16, i16 1 monotonic + ; CHECK: test_xor_i16: + ; ARM: ldrexh [[RA:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldxrh [[RA:w[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strexh [[RA]], {{r[0-9]+}}, [[ADDR]] + ; A64: stxrh [[RA]], {{w[0-9]+}}, [[ADDR]] + ret i16 %0 +} +define i16 @test_max_i16() { +entry: + %0 = atomicrmw max i16* @atomic_i16, i16 1 monotonic + ; CHECK: test_max_i16: + ; ARM: ldrexh [[RA:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldxrh [[RA:w[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strexh [[RA]], {{r[0-9]+}}, [[ADDR]] + ; A64: stxrh [[RA]], {{w[0-9]+}}, [[ADDR]] + ret i16 %0 +} +define i16 @test_min_i16() { +entry: + %0 = atomicrmw min i16* @atomic_i16, i16 1 monotonic + ; CHECK: test_min_i16: + ; ARM: ldrexh [[RA:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldxrh [[RA:w[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strexh [[RA]], {{r[0-9]+}}, [[ADDR]] + ; A64: stxrh [[RA]], {{w[0-9]+}}, [[ADDR]] + ret i16 %0 +} +define i16 @test_umax_i16() { +entry: + %0 = atomicrmw umax i16* @atomic_i16, i16 1 monotonic + ; CHECK: test_umax_i16: + ; ARM: ldrexh [[RA:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldxrh [[RA:w[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strexh [[RA]], {{r[0-9]+}}, [[ADDR]] + ; A64: stxrh [[RA]], {{w[0-9]+}}, [[ADDR]] + ret i16 %0 +} +define i16 @test_umin_i16() { +entry: + %0 = atomicrmw umin i16* @atomic_i16, i16 1 monotonic + ; CHECK: test_umin_i16: + ; ARM: ldrexh [[RA:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldxrh [[RA:w[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strexh [[RA]], {{r[0-9]+}}, [[ADDR]] + ; A64: stxrh [[RA]], {{w[0-9]+}}, [[ADDR]] + ret i16 %0 +} +define half @test_fadd_half() { +entry: + %0 = atomicrmw fadd half* @atomic_half, half 1.0 monotonic + ; CHECK: test_fadd_half: + ; ARM: ldrexh [[RA:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldaxrh [[RA:w[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strexh {{r[0-9]+}}, {{r[0-9]+}}, [[ADDR]] + ; A64: stlxrh {{w[0-9]+}}, {{w[0-9]+}}, [[ADDR]] + ; the strex ends up in the next machine basic block, so regs will be different but address the same + ret half %0 +} +define half @test_fsub_half() { +entry: + %0 = atomicrmw fsub half* @atomic_half, half 1.0 
monotonic + ; CHECK: test_fsub_half: + ; ARM: ldrexh [[RA:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldaxrh [[RA:w[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strexh {{r[0-9]+}}, {{r[0-9]+}}, [[ADDR]] + ; A64: stlxrh {{w[0-9]+}}, {{w[0-9]+}}, [[ADDR]] + ; the strex ends up in the next machine basic block, so regs will be different but address the same + ret half %0 +} + + +define i32 @test_xchg_i32() { +entry: + %0 = atomicrmw xchg i32* @atomic_i32, i32 1 monotonic + ; CHECK: test_xchg_i32: + ; ARM: ldrex [[RA:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldxr [[RA:w[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strex [[RA]], {{r[0-9]+}}, [[ADDR]] + ; A64: stxr [[RA]], {{w[0-9]+}}, [[ADDR]] + ret i32 %0 +} +define i32 @test_add_i32() { +entry: + %0 = atomicrmw add i32* @atomic_i32, i32 1 monotonic + ; CHECK: test_add_i32: + ; ARM: ldrex [[RA:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldxr [[RA:w[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strex [[RA]], {{r[0-9]+}}, [[ADDR]] + ; A64: stxr [[RA]], {{w[0-9]+}}, [[ADDR]] + ret i32 %0 +} +define i32 @test_sub_i32() { +entry: + %0 = atomicrmw sub i32* @atomic_i32, i32 1 monotonic + ; CHECK: test_sub_i32: + ; ARM: ldrex [[RA:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldxr [[RA:w[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strex [[RA]], {{r[0-9]+}}, [[ADDR]] + ; A64: stxr [[RA]], {{w[0-9]+}}, [[ADDR]] + ret i32 %0 +} +define i32 @test_and_i32() { +entry: + %0 = atomicrmw and i32* @atomic_i32, i32 1 monotonic + ; CHECK: test_and_i32: + ; ARM: ldrex [[RA:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldxr [[RA:w[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strex [[RA]], {{r[0-9]+}}, [[ADDR]] + ; A64: stxr [[RA]], {{w[0-9]+}}, [[ADDR]] + ret i32 %0 +} +define i32 @test_nand_i32() { +entry: + %0 = atomicrmw nand i32* @atomic_i32, i32 1 monotonic + ; CHECK: test_nand_i32: + ; ARM: ldrex [[RA:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldxr [[RA:w[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strex [[RA]], {{r[0-9]+}}, [[ADDR]] + ; A64: stxr [[RA]], {{w[0-9]+}}, [[ADDR]] + ret i32 %0 +} +define i32 @test_or_i32() { +entry: + %0 = atomicrmw or i32* @atomic_i32, i32 1 monotonic + ; CHECK: test_or_i32: + ; ARM: ldrex [[RA:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldxr [[RA:w[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strex [[RA]], {{r[0-9]+}}, [[ADDR]] + ; A64: stxr [[RA]], {{w[0-9]+}}, [[ADDR]] + ret i32 %0 +} +define i32 @test_xor_i32() { +entry: + %0 = atomicrmw xor i32* @atomic_i32, i32 1 monotonic + ; CHECK: test_xor_i32: + ; ARM: ldrex [[RA:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldxr [[RA:w[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strex [[RA]], {{r[0-9]+}}, [[ADDR]] + ; A64: stxr [[RA]], {{w[0-9]+}}, [[ADDR]] + ret i32 %0 +} +define i32 @test_max_i32() { +entry: + %0 = atomicrmw max i32* @atomic_i32, i32 1 monotonic + ; CHECK: test_max_i32: + ; ARM: ldrex [[RA:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldxr [[RA:w[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strex [[RA]], {{r[0-9]+}}, [[ADDR]] + ; A64: stxr [[RA]], {{w[0-9]+}}, [[ADDR]] + ret i32 %0 +} +define i32 @test_min_i32() { +entry: + %0 = atomicrmw min i32* @atomic_i32, i32 1 monotonic + ; CHECK: test_min_i32: + ; ARM: ldrex [[RA:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldxr [[RA:w[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strex [[RA]], {{r[0-9]+}}, [[ADDR]] + ; A64: stxr [[RA]], {{w[0-9]+}}, [[ADDR]] + ret i32 %0 +} +define i32 @test_umax_i32() { +entry: + %0 = atomicrmw umax i32* @atomic_i32, i32 1 monotonic + ; 
CHECK: test_umax_i32: + ; ARM: ldrex [[RA:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldxr [[RA:w[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strex [[RA]], {{r[0-9]+}}, [[ADDR]] + ; A64: stxr [[RA]], {{w[0-9]+}}, [[ADDR]] + ret i32 %0 +} +define i32 @test_umin_i32() { +entry: + %0 = atomicrmw umin i32* @atomic_i32, i32 1 monotonic + ; CHECK: test_umin_i32: + ; ARM: ldrex [[RA:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldxr [[RA:w[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strex [[RA]], {{r[0-9]+}}, [[ADDR]] + ; A64: stxr [[RA]], {{w[0-9]+}}, [[ADDR]] + ret i32 %0 +} +define float @test_fadd_float() { +entry: + %0 = atomicrmw fadd float* @atomic_float, float 1.0 monotonic + ; CHECK: test_fadd_float: + ; ARM: ldrex [[RA:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldaxr [[RA:w[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; the strex ends up in the next machine basic block, so regs will be different but address the same + ; ARM: strex {{r[0-9]+}}, {{r[0-9]+}}, [[ADDR]] + ; A64: stlxr {{w[0-9]+}}, {{w[0-9]+}}, [[ADDR]] + ret float %0 +} +define float @test_fsub_float() { +entry: + %0 = atomicrmw fsub float* @atomic_float, float 1.0 monotonic + ; CHECK: test_fsub_float: + ; ARM: ldrex [[RA:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldaxr [[RA:w[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; the strex ends up in the next machine basic block, so regs will be different but address the same + ; ARM: strex {{r[0-9]+}}, {{r[0-9]+}}, [[ADDR]] + ; A64: stlxr {{w[0-9]+}}, {{w[0-9]+}}, [[ADDR]] + ret float %0 +} + + + + +define i64 @test_xchg_i64() { +entry: + %0 = atomicrmw xchg i64* @atomic_i64, i64 1 monotonic + ; CHECK: test_xchg_i64: + ; ARM: ldrexd [[RA:r[0-9]+]], [[RB:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldxr [[RA:x[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strexd {{r[0-9]+}}, {{r[0-9]+}}, {{r[0-9]+}}, [[ADDR]] + ; A64: stxr {{w[0-9]+}}, {{x[0-9]+}}, [[ADDR]] + ret i64 %0 +} +define i64 @test_add_i64() { +entry: + %0 = atomicrmw add i64* @atomic_i64, i64 1 monotonic + ; CHECK: test_add_i64: + ; ARM: ldrexd [[RA:r[0-9]+]], [[RB:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldxr [[RA:x[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strexd {{r[0-9]+}}, {{r[0-9]+}}, {{r[0-9]+}}, [[ADDR]] + ; A64: stxr {{w[0-9]+}}, {{x[0-9]+}}, [[ADDR]] + ret i64 %0 +} +define i64 @test_sub_i64() { +entry: + %0 = atomicrmw sub i64* @atomic_i64, i64 1 monotonic + ; CHECK: test_sub_i64: + ; ARM: ldrexd [[RA:r[0-9]+]], [[RB:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldxr [[RA:x[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strexd {{r[0-9]+}}, {{r[0-9]+}}, {{r[0-9]+}}, [[ADDR]] + ; A64: stxr {{w[0-9]+}}, {{x[0-9]+}}, [[ADDR]] + ret i64 %0 +} +define i64 @test_and_i64() { +entry: + %0 = atomicrmw and i64* @atomic_i64, i64 1 monotonic + ; CHECK: test_and_i64: + ; ARM: ldrexd [[RA:r[0-9]+]], [[RB:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldxr [[RA:x[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strexd {{r[0-9]+}}, {{r[0-9]+}}, {{r[0-9]+}}, [[ADDR]] + ; A64: stxr {{w[0-9]+}}, {{x[0-9]+}}, [[ADDR]] + ret i64 %0 +} +define i64 @test_nand_i64() { +entry: + %0 = atomicrmw nand i64* @atomic_i64, i64 1 monotonic + ; CHECK: test_nand_i64: + ; ARM: ldrexd [[RA:r[0-9]+]], [[RB:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldxr [[RA:x[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strexd {{r[0-9]+}}, {{r[0-9]+}}, {{r[0-9]+}}, [[ADDR]] + ; A64: stxr {{w[0-9]+}}, {{x[0-9]+}}, [[ADDR]] + ret i64 %0 +} +define i64 @test_or_i64() { +entry: + %0 = atomicrmw or i64* @atomic_i64, i64 1 monotonic + 
; CHECK: test_or_i64: + ; ARM: ldrexd [[RA:r[0-9]+]], [[RB:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldxr [[RA:x[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strexd {{r[0-9]+}}, {{r[0-9]+}}, {{r[0-9]+}}, [[ADDR]] + ; A64: stxr {{w[0-9]+}}, {{x[0-9]+}}, [[ADDR]] + ret i64 %0 +} +define i64 @test_xor_i64() { +entry: + %0 = atomicrmw xor i64* @atomic_i64, i64 1 monotonic + ; CHECK: test_xor_i64: + ; ARM: ldrexd [[RA:r[0-9]+]], [[RB:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldxr [[RA:x[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strexd {{r[0-9]+}}, {{r[0-9]+}}, {{r[0-9]+}}, [[ADDR]] + ; A64: stxr {{w[0-9]+}}, {{x[0-9]+}}, [[ADDR]] + ret i64 %0 +} +define i64 @test_max_i64() { +entry: + %0 = atomicrmw max i64* @atomic_i64, i64 1 monotonic + ; CHECK: test_max_i64: + ; ARM: ldrexd [[RA:r[0-9]+]], [[RB:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldxr [[RA:x[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strexd {{r[0-9]+}}, {{r[0-9]+}}, {{r[0-9]+}}, [[ADDR]] + ; A64: stxr {{w[0-9]+}}, {{x[0-9]+}}, [[ADDR]] + ret i64 %0 +} +define i64 @test_min_i64() { +entry: + %0 = atomicrmw min i64* @atomic_i64, i64 1 monotonic + ; CHECK: test_min_i64: + ; ARM: ldrexd [[RA:r[0-9]+]], [[RB:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldxr [[RA:x[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strexd {{r[0-9]+}}, {{r[0-9]+}}, {{r[0-9]+}}, [[ADDR]] + ; A64: stxr {{w[0-9]+}}, {{x[0-9]+}}, [[ADDR]] + ret i64 %0 +} +define i64 @test_umax_i64() { +entry: + %0 = atomicrmw umax i64* @atomic_i64, i64 1 monotonic + ; CHECK: test_umax_i64: + ; ARM: ldrexd [[RA:r[0-9]+]], [[RB:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldxr [[RA:x[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strexd {{r[0-9]+}}, {{r[0-9]+}}, {{r[0-9]+}}, [[ADDR]] + ; A64: stxr {{w[0-9]+}}, {{x[0-9]+}}, [[ADDR]] + ret i64 %0 +} +define i64 @test_umin_i64() { +entry: + %0 = atomicrmw umin i64* @atomic_i64, i64 1 monotonic + ; CHECK: test_umin_i64: + ; ARM: ldrexd [[RA:r[0-9]+]], [[RB:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldxr [[RA:x[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; ARM: strexd {{r[0-9]+}}, {{r[0-9]+}}, {{r[0-9]+}}, [[ADDR]] + ; A64: stxr {{w[0-9]+}}, {{x[0-9]+}}, [[ADDR]] + ret i64 %0 +} +define double @test_fadd_double() { +entry: + %0 = atomicrmw fadd double* @atomic_double, double 1.0 monotonic + ; CHECK: test_fadd_double: + ; ARM: ldrexd [[RA:r[0-9]+]], [[RB:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldaxr [[RA:x[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; the strex ends up in the next machine basic block, so regs will be different but address the same + ; ARM: strexd {{r[0-9]+}}, {{r[0-9]+}}, {{r[0-9]+}}, [[ADDR]] + ; A64: stlxr {{w[0-9]+}}, {{x[0-9]+}}, [[ADDR]] + ret double %0 +} +define double @test_fsub_double() { +entry: + %0 = atomicrmw fsub double* @atomic_double, double 1.0 monotonic + ; CHECK: test_fsub_double: + ; ARM: ldrexd [[RA:r[0-9]+]], [[RB:r[0-9]+]], [[ADDR:.r[0-9]+.]] + ; A64: ldaxr [[RA:x[0-9]+]], [[ADDR:.x[0-9]+.]] + ; CHECK-NOT: str + ; the strex ends up in the next machine basic block, so regs will be different but address the same + ; ARM: strexd {{r[0-9]+}}, {{r[0-9]+}}, {{r[0-9]+}}, [[ADDR]] + ; A64: stlxr {{w[0-9]+}}, {{x[0-9]+}}, [[ADDR]] + ret double %0 +}