diff --git a/llvm/include/llvm/CodeGen/MachineInstr.h b/llvm/include/llvm/CodeGen/MachineInstr.h
--- a/llvm/include/llvm/CodeGen/MachineInstr.h
+++ b/llvm/include/llvm/CodeGen/MachineInstr.h
@@ -1665,6 +1665,9 @@
   /// ordered or volatile memory references.
   bool hasOrderedMemoryRef() const;
 
+  /// Return true if this instruction has a volatile memory reference.
+  bool hasVolatileMemoryRef() const;
+
   /// Return true if this load instruction never traps and points to a memory
   /// location whose value doesn't change during the execution of this function.
   ///
diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp
--- a/llvm/lib/CodeGen/MachineInstr.cpp
+++ b/llvm/lib/CodeGen/MachineInstr.cpp
@@ -1405,6 +1405,14 @@
   });
 }
 
+/// Return true if this instruction has a volatile memory reference.
+bool MachineInstr::hasVolatileMemoryRef() const {
+  // Check if any of our memory operands are volatile.
+  return llvm::any_of(memoperands(), [](const MachineMemOperand *MMO) {
+    return MMO->isVolatile();
+  });
+}
+
 /// isDereferenceableInvariantLoad - Return true if this instruction will never
 /// trap and is loading from a location whose value is invariant across a run of
 /// this function.
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -107,9 +107,21 @@
   /// Returns the base register operator of a load/store.
   static const MachineOperand &getLdStBaseOp(const MachineInstr &MI);
 
-  /// Returns the the immediate offset operator of a load/store.
+  /// Returns the immediate offset operator of a load/store.
   static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI);
 
+  /// Returns the immediate offset of a load/store, or 0 if there is none.
+  static int64_t getLdStOffset(const MachineInstr &MI);
+
+  /// Return whether this load/store has an offset operand.
+  static bool hasLdStOffsetOp(const MachineInstr &MI);
+
+  /// Return whether a pre/post indexed variant of this instruction exists.
+  /// Used by isCandidateToMergeOrPair, which otherwise uses the
+  /// existence/position of immediate operands to decide whether an op is
+  /// suitable for merging/pairing.
+  static bool hasRCPC3PrePostIndexVariant(const MachineInstr &MI);
+
   /// Returns whether the instruction is FP or NEON.
   static bool isFpOrNEON(const MachineInstr &MI);
 
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -2210,6 +2210,14 @@
   case AArch64::LDURBBi:
   case AArch64::LDURSBWi:
   case AArch64::LDURSHWi:
+  case AArch64::LDIAPPWpre:
+  case AArch64::LDIAPPXpre:
+  case AArch64::STILPWpre:
+  case AArch64::STILPXpre:
+  case AArch64::LDAPRWpre:
+  case AArch64::LDAPRXpre:
+  case AArch64::STLRWpre:
+  case AArch64::STLRXpre:
     return true;
   }
 }
@@ -2392,6 +2400,10 @@
   case AArch64::LDRXui:
   case AArch64::LDRWui:
   case AArch64::LDRSWui:
+  case AArch64::LDAPRW:
+  case AArch64::LDAPRX:
+  case AArch64::STLRW:
+  case AArch64::STLRX:
   // Unscaled instructions.
   case AArch64::STURSi:
   case AArch64::STRSpre:
@@ -2515,9 +2527,20 @@
   bool IsPreLdSt = isPreLdSt(MI);
 
   // If this is a volatile load/store, don't mess with it.
-  if (MI.hasOrderedMemoryRef())
+  if (MI.hasVolatileMemoryRef() || MI.memoperands_empty())
     return false;
 
+  // The only ordered loads/stores that we consider here are those in RCPC3.
+ if (!hasRCPC3PrePostIndexVariant(MI)) { + for (const MachineMemOperand *MMO : MI.memoperands()) + if (!MMO->isUnordered()) + return false; + } + + // FIXME temporary to see what this hits besides the instructions we're adding + if (MI.hasOrderedMemoryRef()) + assert(hasRCPC3PrePostIndexVariant(MI)); + // Make sure this is a reg/fi+imm (as opposed to an address reloc). // For Pre-inc LD/ST, the operand is shifted by one. assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() || @@ -2526,9 +2549,11 @@ // For Pre-indexed addressing quadword instructions, the third operand is the // immediate value. - bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(3).isImm(); + bool IsImmPreLdSt = + IsPreLdSt && MI.getNumOperands() >= 4 && MI.getOperand(3).isImm(); + bool IsTypicalCase = MI.getNumOperands() >= 3 && MI.getOperand(2).isImm(); - if (!MI.getOperand(2).isImm() && !IsImmPreLdSt) + if (!IsTypicalCase && !IsImmPreLdSt && !hasRCPC3PrePostIndexVariant(MI)) return false; // Can't merge/pair if the instruction modifies the base register. @@ -3098,6 +3123,14 @@ case AArch64::LDURSBWi: case AArch64::STRBBui: case AArch64::STURBBi: + case AArch64::LDIAPPW: + case AArch64::LDIAPPX: + case AArch64::STILPW: + case AArch64::STILPX: + case AArch64::LDAPRW: + case AArch64::LDAPRX: + case AArch64::STLRW: + case AArch64::STLRX: return 1; case AArch64::LDRHHui: case AArch64::LDURHHi: @@ -3194,12 +3227,16 @@ switch (MI.getOpcode()) { default: return false; + case AArch64::LDIAPPW: + case AArch64::LDIAPPX: case AArch64::LDPSi: case AArch64::LDPSWi: case AArch64::LDPDi: case AArch64::LDPQi: case AArch64::LDPWi: case AArch64::LDPXi: + case AArch64::STILPW: + case AArch64::STILPX: case AArch64::STPSi: case AArch64::STPDi: case AArch64::STPQi: @@ -3217,14 +3254,44 @@ return MI.getOperand(Idx); } +bool AArch64InstrInfo::hasRCPC3PrePostIndexVariant(const MachineInstr &MI) { + switch (MI.getOpcode()) { + default: + return false; + // Added by FEAT_RCPC3 + case AArch64::LDIAPPW: + case AArch64::STILPW: + case AArch64::LDIAPPX: + case AArch64::STILPX: + // Pre-existing, but FEAT_LRCPC3 added pre/post indexed versions + case AArch64::LDAPRW: + case AArch64::LDAPRX: + case AArch64::STLRW: + case AArch64::STLRX: + return true; + } +} + +bool AArch64InstrInfo::hasLdStOffsetOp(const MachineInstr &MI) { + // Currently only the FEAT_LRCPC3 instructions don't have an index operand. + return !hasRCPC3PrePostIndexVariant(MI); +} + const MachineOperand & AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) { + assert(hasLdStOffsetOp(MI)); unsigned Idx = AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 
              3 : 2;
   return MI.getOperand(Idx);
 }
 
+int64_t AArch64InstrInfo::getLdStOffset(const MachineInstr &MI) {
+  if (!hasLdStOffsetOp(MI))
+    return 0;
+  return AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
+}
+
 static const TargetRegisterClass *getRegClass(const MachineInstr &MI,
                                               Register Reg) {
   if (MI.getParent() == nullptr)
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -21,6 +21,7 @@
 #include "AArch64MachineFunctionInfo.h"
 #include "AArch64Subtarget.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
@@ -475,6 +476,14 @@
     return AArch64::STZ2GPreIndex;
   case AArch64::STGPi:
     return AArch64::STGPpre;
+  case AArch64::STILPW:
+    return AArch64::STILPWpre;
+  case AArch64::STILPX:
+    return AArch64::STILPXpre;
+  case AArch64::STLRW:
+    return AArch64::STLRWpre;
+  case AArch64::STLRX:
+    return AArch64::STLRXpre;
   }
 }
 
@@ -534,6 +543,18 @@
     return AArch64::LDPWpost;
   case AArch64::LDPXi:
     return AArch64::LDPXpost;
+  case AArch64::LDIAPPW:
+    return AArch64::LDIAPPWpre;
+  case AArch64::LDIAPPX:
+    return AArch64::LDIAPPXpre;
+  case AArch64::LDAPRW:
+    return AArch64::LDAPRWpre;
+  case AArch64::LDAPRX:
+    return AArch64::LDAPRXpre;
+  case AArch64::STLRW:
+    return AArch64::STLRWpre;
+  case AArch64::STLRX:
+    return AArch64::STLRXpre;
   case AArch64::STPSi:
     return AArch64::STPSpost;
   case AArch64::STPDi:
@@ -588,9 +609,53 @@
   }
 }
 
+static bool getRCPC3MemOpInfo(const MachineInstr &MI, int &Scale,
+                              int &MinOffset, int &MaxOffset) {
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+  case AArch64::LDIAPPW:
+    Scale = 1;
+    MinOffset = MaxOffset = 8;
+    return true;
+  case AArch64::STILPW:
+    Scale = 1;
+    MinOffset = MaxOffset = -8;
+    return true;
+  case AArch64::LDIAPPX:
+    Scale = 1;
+    MinOffset = MaxOffset = 16;
+    return true;
+  case AArch64::STILPX:
+    Scale = 1;
+    MinOffset = MaxOffset = -16;
+    return true;
+  case AArch64::LDAPRW:
+    Scale = 1;
+    MinOffset = MaxOffset = 4;
+    return true;
+  case AArch64::STLRW:
+    Scale = 1;
+    MinOffset = MaxOffset = -4;
+    return true;
+  case AArch64::LDAPRX:
+    Scale = 1;
+    MinOffset = MaxOffset = 8;
+    return true;
+  case AArch64::STLRX:
+    Scale = 1;
+    MinOffset = MaxOffset = -8;
+    return true;
+  }
+}
+
 // Returns the scale and offset range of pre/post indexed variants of MI.
 static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale,
                                        int &MinOffset, int &MaxOffset) {
+  // Special case: the RCPC3 writeback forms accept only one specific offset.
+  if (getRCPC3MemOpInfo(MI, Scale, MinOffset, MaxOffset))
+    return;
+
   bool IsPaired = AArch64InstrInfo::isPairedLdSt(MI);
   bool IsTagStore = isTagStore(MI);
   // ST*G and all paired ldst have the same scale in pre/post-indexed variants
@@ -663,6 +728,8 @@
 
 static bool isMergeableLdStUpdate(MachineInstr &MI) {
   unsigned Opc = MI.getOpcode();
+  if (AArch64InstrInfo::hasRCPC3PrePostIndexVariant(MI))
+    return true;
   switch (Opc) {
   default:
     return false;
@@ -712,7 +779,6 @@
     // Make sure this is a reg+imm (as opposed to an address reloc).
     if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
       return false;
-
     return true;
   }
 }
@@ -1287,11 +1353,13 @@
                                     LdStPairFlags &Flags,
                                     const AArch64InstrInfo *TII) {
   // If this is volatile or if pairing is suppressed, not a candidate.
- if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI)) + if (MI.hasVolatileMemoryRef() || MI.memoperands_empty()) + return false; + if (TII->isLdStPairSuppressed(MI)) return false; // We should have already checked FirstMI for pair suppression and volatility. - assert(!FirstMI.hasOrderedMemoryRef() && + assert(!FirstMI.hasVolatileMemoryRef() && !TII->isLdStPairSuppressed(FirstMI) && "FirstMI shouldn't get here if either of these checks are true."); @@ -1306,6 +1374,12 @@ if (OpcA == OpcB) return !AArch64InstrInfo::isPreLdSt(FirstMI); + // For RCPC3, we only merge pairs with matching opcodes. + if (AArch64InstrInfo::hasRCPC3PrePostIndexVariant(FirstMI)) { + assert(OpcA != OpcB); + return false; + } + // Try to match a sign-extended load/store with a zero-extended load/store. bool IsValidLdStrOpc, PairIsValidLdStrOpc; unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc); @@ -1528,7 +1602,7 @@ bool IsUnscaled = TII->hasUnscaledLdStOffset(FirstMI); Register Reg = getLdStRegOp(FirstMI).getReg(); Register BaseReg = AArch64InstrInfo::getLdStBaseOp(FirstMI).getReg(); - int Offset = AArch64InstrInfo::getLdStOffsetOp(FirstMI).getImm(); + int Offset = AArch64InstrInfo::getLdStOffset(FirstMI); int OffsetStride = IsUnscaled ? TII->getMemScale(FirstMI) : 1; bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI); @@ -1563,6 +1637,7 @@ Flags.setSExtIdx(-1); if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) && + AArch64InstrInfo::hasLdStOffsetOp(MI) && AArch64InstrInfo::getLdStOffsetOp(MI).isImm()) { assert(MI.mayLoadOrStore() && "Expected memory operation."); // If we've found another instruction with the same opcode, check to see @@ -1572,7 +1647,7 @@ // actually an immediate and not a symbolic reference destined for // a relocation. Register MIBaseReg = AArch64InstrInfo::getLdStBaseOp(MI).getReg(); - int MIOffset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm(); + int MIOffset = AArch64InstrInfo::getLdStOffset(MI); bool MIIsUnscaled = TII->hasUnscaledLdStOffset(MI); if (IsUnscaled != MIIsUnscaled) { // We're trying to pair instructions that differ in how they are scaled. @@ -1819,9 +1894,10 @@ .add(getLdStRegOp(*Update)) .add(getLdStRegOp(*I)) .add(AArch64InstrInfo::getLdStBaseOp(*I)) - .addImm(Value / Scale) .setMemRefs(I->memoperands()) .setMIFlags(I->mergeFlagsWith(*Update)); + if (AArch64InstrInfo::hasLdStOffsetOp(*I)) + MIB.addImm(Value / Scale); } else { // Paired instruction. MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) @@ -1829,9 +1905,10 @@ .add(getLdStRegOp(*I, 0)) .add(getLdStRegOp(*I, 1)) .add(AArch64InstrInfo::getLdStBaseOp(*I)) - .addImm(Value / Scale) .setMemRefs(I->memoperands()) .setMIFlags(I->mergeFlagsWith(*Update)); + if (AArch64InstrInfo::hasLdStOffsetOp(*I)) + MIB.addImm(Value / Scale); } if (CFI != E) { MachineBasicBlock *MBB = I->getParent(); @@ -1914,8 +1991,9 @@ MachineBasicBlock::iterator MBBI = I; Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg(); - int MIUnscaledOffset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm() * - TII->getMemScale(MemMI); + + int MIUnscaledOffset = + AArch64InstrInfo::getLdStOffset(MemMI) * TII->getMemScale(MemMI); // Scan forward looking for post-index opportunities. 
   // Updating instructions can't be formed if the memory instruction doesn't have the offset we're
@@ -1992,7 +2070,7 @@
   MachineFunction &MF = *MemMI.getMF();
 
   Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
-  int Offset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm();
+  int Offset = AArch64InstrInfo::getLdStOffset(MemMI);
 
   // If the load/store is the first instruction in the block, there's obviously
   // not any matching update. Ditto if the memory offset isn't zero.
@@ -2128,7 +2206,7 @@
   // range, plus allow an extra one in case we find a later insn that matches
   // with Offset-1)
   bool IsUnscaled = TII->hasUnscaledLdStOffset(MI);
-  int Offset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
+  int Offset = AArch64InstrInfo::getLdStOffset(MI);
   int OffsetStride = IsUnscaled ? TII->getMemScale(MI) : 1;
   // Allow one more for offset.
   if (Offset > 0)
@@ -2196,7 +2274,7 @@
   // operation. The immediate in the add we're looking for,
   // however, is not, so adjust here.
   int UnscaledOffset =
-      AArch64InstrInfo::getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI);
+      AArch64InstrInfo::getLdStOffset(MI) * TII->getMemScale(MI);
 
   // Look forward to try to find a pre-index instruction. For example,
   // ldr x1, [x0, #64]
diff --git a/llvm/test/CodeGen/AArch64/aarch64-rcpc3-ldst.ll b/llvm/test/CodeGen/AArch64/aarch64-rcpc3-ldst.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-rcpc3-ldst.ll
@@ -0,0 +1,396 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter "^\s*(lda|ldia|stl|stil)"
+
+; RUN: llc %s -o - -mtriple=aarch64-none-linux-gnu -O1 -mattr=+v8.1a,+rcpc,+rcpc-immo,+rcpc3 -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-NO-LSE2
+; RUN: llc %s -o - -mtriple=aarch64-none-linux-gnu -O1 -mattr=+v8.1a,+rcpc,+rcpc-immo,+rcpc3 | FileCheck %s --check-prefixes=SDAG-NO-LSE2
+; RUN: llc %s -o - -mtriple=aarch64-none-linux-gnu -O1 -mattr=+v8.1a,+lse2,+rcpc,+rcpc-immo,+rcpc3 -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-LSE2
+; RUN: llc %s -o - -mtriple=aarch64-none-linux-gnu -O1 -mattr=+v8.1a,+lse2,+rcpc,+rcpc-immo,+rcpc3 | FileCheck %s --check-prefixes=SDAG-LSE2
+; RUN: llc %s -o - -mtriple=aarch64_be-none-linux-gnu -O1 -mattr=+v8.1a,+rcpc,+rcpc-immo,+rcpc3 | FileCheck %s --check-prefixes=SDAG-NO-LSE2
+; RUN: llc %s -o - -mtriple=aarch64_be-none-linux-gnu -O1 -mattr=+v8.1a,+lse2,+rcpc,+rcpc-immo,+rcpc3 | FileCheck %s --check-prefixes=SDAG-LSE2
+
+; TODO:
+; SelectionDAG implementation
+; merge two ordered load-acquires into a load-acquire pair
+; merging of pre/post indexed variants
+; LDIAPP only generated if we have LSE
+; LSE2 makes pair operations single-copy atomic for naturally aligned accesses
+
+; Note:
+; Loading the pointer indirectly results in an update operation (add/sub)
+; which reuses the same src/dst register (x8).
+; Doing getelementptr on the input ptr directly results in `x8 = add blah`.
+
+; RCPC3 lets us merge two 32/64-bit atomic loads into a single one.
+define dso_local void @load_store_2xi32_rcpc3(ptr %ptr2ptr) { +; GISEL-NO-LSE2-LABEL: load_store_2xi32_rcpc3: +; GISEL-NO-LSE2: ldapr w10, [x8] +; GISEL-NO-LSE2: ldapr w11, [x9] +; GISEL-NO-LSE2: stlr w10, [x8] +; GISEL-NO-LSE2: stlr w11, [x9] +; +; SDAG-NO-LSE2-LABEL: load_store_2xi32_rcpc3: +; SDAG-NO-LSE2: ldapr w10, [x8] +; SDAG-NO-LSE2: ldapr w11, [x9] +; SDAG-NO-LSE2: stlr w10, [x8] +; SDAG-NO-LSE2: stlr w11, [x9] +; +; GISEL-LSE2-LABEL: load_store_2xi32_rcpc3: +; GISEL-LSE2: ldapr w10, [x8] +; GISEL-LSE2: ldapr w11, [x9] +; GISEL-LSE2: stlr w10, [x8] +; GISEL-LSE2: stlr w11, [x9] +; +; SDAG-LSE2-LABEL: load_store_2xi32_rcpc3: +; SDAG-LSE2: ldapr w10, [x8] +; SDAG-LSE2: ldapr w11, [x9] +; SDAG-LSE2: stlr w10, [x8] +; SDAG-LSE2: stlr w11, [x9] + %ptr1 = load ptr, ptr %ptr2ptr + %ptr2 = getelementptr ptr, ptr %ptr1, i32 1 + + %a1 = load atomic i32, ptr %ptr1 acquire, align 8 + %a2 = load atomic i32, ptr %ptr2 acquire, align 8 + + %b1 = add i32 %a1, %a1 + %b2 = add i32 %a2, %a2 + + store atomic i32 %b1, ptr %ptr1 release, align 8 + store atomic i32 %b2, ptr %ptr2 release, align 8 + + ret void +} + +define dso_local void @load_store_2xi64_rcpc3(ptr %ptr2ptr) { +; GISEL-NO-LSE2-LABEL: load_store_2xi64_rcpc3: +; GISEL-NO-LSE2: ldapr x10, [x8] +; GISEL-NO-LSE2: ldapr x11, [x9] +; GISEL-NO-LSE2: stlr x10, [x8] +; GISEL-NO-LSE2: stlr x11, [x9] +; +; SDAG-NO-LSE2-LABEL: load_store_2xi64_rcpc3: +; SDAG-NO-LSE2: ldapr x10, [x8] +; SDAG-NO-LSE2: ldapr x11, [x9] +; SDAG-NO-LSE2: stlr x10, [x8] +; SDAG-NO-LSE2: stlr x11, [x9] +; +; GISEL-LSE2-LABEL: load_store_2xi64_rcpc3: +; GISEL-LSE2: ldapr x10, [x8] +; GISEL-LSE2: ldapr x11, [x9] +; GISEL-LSE2: stlr x10, [x8] +; GISEL-LSE2: stlr x11, [x9] +; +; SDAG-LSE2-LABEL: load_store_2xi64_rcpc3: +; SDAG-LSE2: ldapr x10, [x8] +; SDAG-LSE2: ldapr x11, [x9] +; SDAG-LSE2: stlr x10, [x8] +; SDAG-LSE2: stlr x11, [x9] + %ptr1 = load ptr, ptr %ptr2ptr + %ptr2 = getelementptr ptr, ptr %ptr1, i64 1 + + %a1 = load atomic i64, ptr %ptr1 acquire, align 8 + %a2 = load atomic i64, ptr %ptr2 acquire, align 8 + + %b1 = add i64 %a1, %a1 + %b2 = add i64 %a2, %a2 + + store atomic i64 %b1, ptr %ptr1 release, align 8 + store atomic i64 %b2, ptr %ptr2 release, align 8 + + ret void +} + +; TODO Same again but with offsets + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Register Pair Ordered - Load Acquire RCpc / Store Release ; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; RCPC3 + LSE2 gives us 128-bit single-copy atomics. 
+define dso_local void @load_atomic_i128_no_offset(ptr %ptr2ptr) { +; GISEL-NO-LSE2-LABEL: load_atomic_i128_no_offset: +; GISEL-NO-LSE2: ldiapp xzr, x8, [x8] +; +; SDAG-NO-LSE2-LABEL: load_atomic_i128_no_offset: +; SDAG-NO-LSE2: ldiapp xzr, x8, [x8] +; +; GISEL-LSE2-LABEL: load_atomic_i128_no_offset: +; GISEL-LSE2: ldiapp xzr, x8, [x8] +; +; SDAG-LSE2-LABEL: load_atomic_i128_no_offset: +; SDAG-LSE2: ldiapp xzr, x8, [x8] + %ptr = load ptr, ptr %ptr2ptr + %a = load atomic i128, ptr %ptr acquire, align 16 + ret void +} + +define dso_local void @store_atomic_i128_no_offset(ptr %ptr2ptr) { +; GISEL-NO-LSE2-LABEL: store_atomic_i128_no_offset: +; GISEL-NO-LSE2: stilp x8, xzr, [x9] +; +; SDAG-NO-LSE2-LABEL: store_atomic_i128_no_offset: +; SDAG-NO-LSE2: stilp x8, xzr, [x9] +; +; GISEL-LSE2-LABEL: store_atomic_i128_no_offset: +; GISEL-LSE2: stilp x8, xzr, [x9] +; +; SDAG-LSE2-LABEL: store_atomic_i128_no_offset: +; SDAG-LSE2: stilp x8, xzr, [x9] + %ptr = load ptr, ptr %ptr2ptr + store atomic i128 1, ptr %ptr release, align 16 + ret void +} + +; Same again with pre/post indexing. +define dso_local void @load_atomic_i128_offset_16(ptr %base_ptr) { +; GISEL-NO-LSE2-LABEL: load_atomic_i128_offset_16: +; GISEL-NO-LSE2: ldiapp x8, x9, [x0], #16 +; +; SDAG-NO-LSE2-LABEL: load_atomic_i128_offset_16: +; SDAG-NO-LSE2: ldiapp x8, x9, [x0], #16 +; +; GISEL-LSE2-LABEL: load_atomic_i128_offset_16: +; GISEL-LSE2: ldiapp x8, x9, [x0], #16 +; +; SDAG-LSE2-LABEL: load_atomic_i128_offset_16: +; SDAG-LSE2: ldiapp x8, x9, [x0], #16 +entry: + br label %body + +body: + %ptr = phi ptr [ %offset_ptr, %body ], [ %base_ptr, %entry ] + + ; Machine scheduler has a tendency to move the ADD before the LOAD. + ; Hence the loop, to ensure the ADD reuses one register for src and dst. + %val = load atomic i128, ptr %ptr acquire, align 16 + %offset_ptr = getelementptr i64, ptr %ptr, i64 2 + + ; %val is used for %cond to ensure the LOAD is not removed. + %cond = icmp eq i128 %val, 0 + br i1 %cond, label %exit, label %body + +exit: + ret void +} + +define dso_local void @store_atomic_i128_offset_16(ptr %ptr2ptr) { +; GISEL-NO-LSE2-LABEL: store_atomic_i128_offset_16: +; GISEL-NO-LSE2: stilp x8, xzr, [x9, #-16]! +; +; SDAG-NO-LSE2-LABEL: store_atomic_i128_offset_16: +; SDAG-NO-LSE2: stilp x8, xzr, [x9, #-16]! +; +; GISEL-LSE2-LABEL: store_atomic_i128_offset_16: +; GISEL-LSE2: stilp x8, xzr, [x9, #-16]! +; +; SDAG-LSE2-LABEL: store_atomic_i128_offset_16: +; SDAG-LSE2: stilp x8, xzr, [x9, #-16]! 
+ %ptr = load ptr, ptr %ptr2ptr + %ptr_wb = getelementptr i64, ptr %ptr, i64 -2 + store atomic i128 1, ptr %ptr_wb release, align 16 + ret void +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Single register - Load Acquire RCpc ; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; LDAPR ,[],#8 Load Acquire RCpc Register, post index(8) with writeback +; LDAPR ,[],#4 Load Acquire RCpc Register, post index(4) with writeback + +define dso_local i64 @load_atomic_i64_no_offset(ptr %ptr2ptr) { +; GISEL-NO-LSE2-LABEL: load_atomic_i64_no_offset: +; GISEL-NO-LSE2: ldapr x0, [x8] +; +; SDAG-NO-LSE2-LABEL: load_atomic_i64_no_offset: +; SDAG-NO-LSE2: ldapr x0, [x8] +; +; GISEL-LSE2-LABEL: load_atomic_i64_no_offset: +; GISEL-LSE2: ldapr x0, [x8] +; +; SDAG-LSE2-LABEL: load_atomic_i64_no_offset: +; SDAG-LSE2: ldapr x0, [x8] + %ptr = load ptr, ptr %ptr2ptr + %a = load atomic i64, ptr %ptr acquire, align 8 + ret i64 %a +} + +define dso_local i32 @load_atomic_i32_no_offset(ptr %ptr2ptr) { +; GISEL-NO-LSE2-LABEL: load_atomic_i32_no_offset: +; GISEL-NO-LSE2: ldapr w0, [x8] +; +; SDAG-NO-LSE2-LABEL: load_atomic_i32_no_offset: +; SDAG-NO-LSE2: ldapr w0, [x8] +; +; GISEL-LSE2-LABEL: load_atomic_i32_no_offset: +; GISEL-LSE2: ldapr w0, [x8] +; +; SDAG-LSE2-LABEL: load_atomic_i32_no_offset: +; SDAG-LSE2: ldapr w0, [x8] + %ptr = load ptr, ptr %ptr2ptr + %a = load atomic i32, ptr %ptr acquire, align 4 + ret i32 %a +} + +define dso_local void @load_atomic_i64_offset_8(ptr %base_ptr) { +; GISEL-NO-LSE2-LABEL: load_atomic_i64_offset_8: +; GISEL-NO-LSE2: ldapr x8, [x0], #8 +; +; SDAG-NO-LSE2-LABEL: load_atomic_i64_offset_8: +; SDAG-NO-LSE2: ldapr x8, [x0], #8 +; +; GISEL-LSE2-LABEL: load_atomic_i64_offset_8: +; GISEL-LSE2: ldapr x8, [x0], #8 +; +; SDAG-LSE2-LABEL: load_atomic_i64_offset_8: +; SDAG-LSE2: ldapr x8, [x0], #8 +entry: + br label %body + +body: + %ptr = phi ptr [ %offset_ptr, %body ], [ %base_ptr, %entry ] + + ; Machine scheduler has a tendency to move the ADD before the LOAD. + ; Hence the loop, to ensure the ADD reuses one register for src and dst. + %val = load atomic i64, ptr %ptr acquire, align 8 + %offset_ptr = getelementptr i64, ptr %ptr, i64 1 + + ; %val is used for %cond to ensure the LOAD is not removed. + %cond = icmp eq i64 %val, 0 + br i1 %cond, label %exit, label %body + +exit: + ret void +} + +define dso_local void @load_atomic_i32_offset_4(ptr %base_ptr) { +; GISEL-NO-LSE2-LABEL: load_atomic_i32_offset_4: +; GISEL-NO-LSE2: ldapr w8, [x0], #4 +; +; SDAG-NO-LSE2-LABEL: load_atomic_i32_offset_4: +; SDAG-NO-LSE2: ldapr w8, [x0], #4 +; +; GISEL-LSE2-LABEL: load_atomic_i32_offset_4: +; GISEL-LSE2: ldapr w8, [x0], #4 +; +; SDAG-LSE2-LABEL: load_atomic_i32_offset_4: +; SDAG-LSE2: ldapr w8, [x0], #4 +entry: + br label %body + +body: + %ptr = phi ptr [ %offset_ptr, %body ], [ %base_ptr, %entry ] + + ; Machine scheduler has a tendency to move the ADD before the LOAD. + ; Hence the loop, to ensure the ADD reuses one register for src and dst. + %val = load atomic i32, ptr %ptr acquire, align 4 + %offset_ptr = getelementptr i32, ptr %ptr, i64 1 + + ; %val is used for %cond to ensure the LOAD is not removed. + %cond = icmp eq i32 %val, 0 + br i1 %cond, label %exit, label %body + +exit: + ret void +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Single register – Store Release ; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; STLR ,[, #-8]! Store Release Register, negative pre index(8) with writeback +; STLR ,[, #-4]! 
Store Release Register, negative pre index(4) with writeback + +define dso_local void @store_atomic_i32_no_offset(ptr %ptr2ptr) { +; GISEL-NO-LSE2-LABEL: store_atomic_i32_no_offset: +; GISEL-NO-LSE2: stlr w8, [x9] +; +; SDAG-NO-LSE2-LABEL: store_atomic_i32_no_offset: +; SDAG-NO-LSE2: stlr w8, [x9] +; +; GISEL-LSE2-LABEL: store_atomic_i32_no_offset: +; GISEL-LSE2: stlr w8, [x9] +; +; SDAG-LSE2-LABEL: store_atomic_i32_no_offset: +; SDAG-LSE2: stlr w8, [x9] + %ptr = load ptr, ptr %ptr2ptr + store atomic i32 1, ptr %ptr release, align 4 + ret void +} + +define dso_local void @store_atomic_i64_no_offset(ptr %ptr2ptr) { +; GISEL-NO-LSE2-LABEL: store_atomic_i64_no_offset: +; GISEL-NO-LSE2: stlr x8, [x9] +; +; SDAG-NO-LSE2-LABEL: store_atomic_i64_no_offset: +; SDAG-NO-LSE2: stlr x8, [x9] +; +; GISEL-LSE2-LABEL: store_atomic_i64_no_offset: +; GISEL-LSE2: stlr x8, [x9] +; +; SDAG-LSE2-LABEL: store_atomic_i64_no_offset: +; SDAG-LSE2: stlr x8, [x9] + %ptr = load ptr, ptr %ptr2ptr + store atomic i64 1, ptr %ptr release, align 8 + ret void +} + + +define dso_local void @store_atomic_i32_offset_4(ptr %ptr2ptr) { +; GISEL-NO-LSE2-LABEL: store_atomic_i32_offset_4: +; GISEL-NO-LSE2: stlr w8, [x9, #-4]! +; +; SDAG-NO-LSE2-LABEL: store_atomic_i32_offset_4: +; SDAG-NO-LSE2: stlr w8, [x9, #-4]! +; +; GISEL-LSE2-LABEL: store_atomic_i32_offset_4: +; GISEL-LSE2: stlr w8, [x9, #-4]! +; +; SDAG-LSE2-LABEL: store_atomic_i32_offset_4: +; SDAG-LSE2: stlr w8, [x9, #-4]! + %ptr = load ptr, ptr %ptr2ptr + %ptr_wb = getelementptr i32, ptr %ptr, i64 -1 + store atomic i32 1, ptr %ptr_wb release, align 4 + ret void +} + +define dso_local void @store_atomic_i64_offset_8(ptr %ptr2ptr) { +; GISEL-NO-LSE2-LABEL: store_atomic_i64_offset_8: +; GISEL-NO-LSE2: stlr x8, [x9, #-8]! +; +; SDAG-NO-LSE2-LABEL: store_atomic_i64_offset_8: +; SDAG-NO-LSE2: stlr x8, [x9, #-8]! +; +; GISEL-LSE2-LABEL: store_atomic_i64_offset_8: +; GISEL-LSE2: stlr x8, [x9, #-8]! +; +; SDAG-LSE2-LABEL: store_atomic_i64_offset_8: +; SDAG-LSE2: stlr x8, [x9, #-8]! + %ptr = load ptr, ptr %ptr2ptr + %ptr_wb = getelementptr i64, ptr %ptr, i64 -1 + store atomic i64 1, ptr %ptr_wb release, align 8 + ret void +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; 3.1.2 Additions to the Advanced SIMD and floating-point ISA - Register variant ; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; LDAPUR , [{, #}] // Zeroing +; LDAPUR , [{, #}] // Zeroing +; LDAPUR , [{, #}] // Zeroing +; LDAPUR
<Dt>, [<Xn|SP>{, #<simm>}] // Zeroing
+; LDAPUR <Qt>, [<Xn|SP>{, #<simm>}]
+; STLUR <Bt>, [<Xn|SP>{, #<simm>}]
+; STLUR <Ht>, [<Xn|SP>{, #<simm>}]
+; STLUR <St>, [<Xn|SP>{, #<simm>}]
+; STLUR <Dt>, [<Xn|SP>{, #<simm>}]
+; STLUR <Qt>, [<Xn|SP>{, #<simm>}]
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; 3.1.2 Additions to the Advanced SIMD and floating-point ISA - Register Index variant ;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; LDAP1 { <Vt>.D }[<index>], [<Xn|SP>] // Merging
+; STL1 { <Vt>.D }[<index>], [<Xn|SP>]
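
For reference, a minimal hand-written sketch of the base-update rewrites the AArch64LoadStoreOptimizer changes above aim to form (register choices are illustrative, not autogenerated output; the fixed writeback offsets mirror getRCPC3MemOpInfo):

    ldapr x8, [x0]              ; acquire load
    add   x0, x0, #8            ; base update
  =>
    ldapr x8, [x0], #8          ; post-indexed form; the X-register variant only accepts #8

    sub   x1, x1, #16           ; base update
    stilp x2, x3, [x1]          ; release store pair
  =>
    stilp x2, x3, [x1, #-16]!   ; pre-indexed form; the X-register pair only accepts #-16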