diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -260,8 +260,10 @@
   STG,
   STZG,
   ST2G,
-  STZ2G
+  STZ2G,
+  LDP,
+  STP
 };

 } // end namespace AArch64ISD

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -516,6 +516,10 @@
   setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
   setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);

+  // 128-bit loads and stores can be done without expanding
+  setOperationAction(ISD::LOAD, MVT::i128, Custom);
+  setOperationAction(ISD::STORE, MVT::i128, Custom);
+
   // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
   // This requires the Performance Monitors extension.
   if (Subtarget->hasPerfMon())
@@ -1364,6 +1368,8 @@
   case AArch64ISD::SST1_SXTW_SCALED: return "AArch64ISD::SST1_SXTW_SCALED";
   case AArch64ISD::SST1_UXTW_SCALED: return "AArch64ISD::SST1_UXTW_SCALED";
   case AArch64ISD::SST1_IMM: return "AArch64ISD::SST1_IMM";
+  case AArch64ISD::LDP: return "AArch64ISD::LDP";
+  case AArch64ISD::STP: return "AArch64ISD::STP";
   }
   return nullptr;
 }
@@ -2988,7 +2994,7 @@

 // Custom lowering for any store, vector or scalar and/or default or with
 // a truncate operations. Currently only custom lower truncate operation
-// from vector v4i16 to v4i8.
+// from vector v4i16 to v4i8 or volatile stores of i128.
 SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
                                           SelectionDAG &DAG) const {
   SDLoc Dl(Op);
@@ -3000,18 +3006,32 @@
   EVT VT = Value.getValueType();
   EVT MemVT = StoreNode->getMemoryVT();

-  assert (VT.isVector() && "Can only custom lower vector store types");
-
-  unsigned AS = StoreNode->getAddressSpace();
-  unsigned Align = StoreNode->getAlignment();
-  if (Align < MemVT.getStoreSize() &&
-      !allowsMisalignedMemoryAccesses(
-          MemVT, AS, Align, StoreNode->getMemOperand()->getFlags(), nullptr)) {
-    return scalarizeVectorStore(StoreNode, DAG);
-  }
-
-  if (StoreNode->isTruncatingStore()) {
-    return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
+  if (VT.isVector()) {
+    unsigned AS = StoreNode->getAddressSpace();
+    unsigned Align = StoreNode->getAlignment();
+    if (Align < MemVT.getStoreSize() &&
+        !allowsMisalignedMemoryAccesses(MemVT, AS, Align,
+                                        StoreNode->getMemOperand()->getFlags(),
+                                        nullptr)) {
+      return scalarizeVectorStore(StoreNode, DAG);
+    }
+
+    if (StoreNode->isTruncatingStore()) {
+      return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
+    }
+  } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
+    assert(StoreNode->getValue()->getValueType(0) == MVT::i128);
+    SDValue Lo =
+        DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(),
+                    DAG.getConstant(0, Dl, MVT::i64));
+    SDValue Hi =
+        DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(),
+                    DAG.getConstant(1, Dl, MVT::i64));
+    SDValue Result = DAG.getMemIntrinsicNode(
+        AArch64ISD::STP, Dl, DAG.getVTList(MVT::Other),
+        {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
+        StoreNode->getMemoryVT(), StoreNode->getMemOperand());
+    return Result;
   }

   return SDValue();
@@ -12689,6 +12709,27 @@
   case ISD::ATOMIC_CMP_SWAP:
     ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
     return;
+  case ISD::LOAD: {
+    assert(SDValue(N, 0).getValueType() == MVT::i128 &&
+           "unexpected load's value type");
+    LoadSDNode *LoadNode = cast<LoadSDNode>(N);
+    if (!LoadNode->isVolatile() || LoadNode->getMemoryVT() != MVT::i128) {
+      // Non-volatile loads are optimized later in AArch64's load/store
+      // optimizer.
+      return;
+    }
+
+    SDValue Result = DAG.getMemIntrinsicNode(
+        AArch64ISD::LDP, SDLoc(N),
+        DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
+        {LoadNode->getChain(), LoadNode->getBasePtr()}, LoadNode->getMemoryVT(),
+        LoadNode->getMemOperand());
+
+    SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
+                               Result.getValue(0), Result.getValue(1));
+    Results.append({Pair, Result.getValue(2) /* Chain */});
+    return;
+  }
   }
 }
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -243,6 +243,9 @@
 def SDT_AArch64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>,
                                                    SDTCisPtrTy<1>]>;

+def SDT_AArch64ldp : SDTypeProfile<2, 1, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
+def SDT_AArch64stp : SDTypeProfile<0, 3, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
+
 // Generates the general dynamic sequences, i.e.
 //  adrp  x0, :tlsdesc:var
 //  ldr   x1, [x0, #:tlsdesc_lo12:var]
@@ -535,6 +538,9 @@
 def AArch64uunpkhi : SDNode<"AArch64ISD::UUNPKHI", SDT_AArch64unpk>;
 def AArch64uunpklo : SDNode<"AArch64ISD::UUNPKLO", SDT_AArch64unpk>;

+def AArch64ldp : SDNode<"AArch64ISD::LDP", SDT_AArch64ldp, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def AArch64stp : SDNode<"AArch64ISD::STP", SDT_AArch64stp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
 //===----------------------------------------------------------------------===//

 //===----------------------------------------------------------------------===//
@@ -1987,6 +1993,9 @@
 defm LDNPD : LoadPairNoAlloc<0b01, 1, FPR64Op, simm7s8, "ldnp">;
 defm LDNPQ : LoadPairNoAlloc<0b10, 1, FPR128Op, simm7s16, "ldnp">;

+def : Pat<(AArch64ldp (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)),
+          (LDPXi GPR64sp:$Rn, simm7s8:$offset)>;
+
 //---
 // (register offset)
 //---
@@ -2680,6 +2689,9 @@
 defm STNPD : StorePairNoAlloc<0b01, 1, FPR64Op, simm7s8, "stnp">;
 defm STNPQ : StorePairNoAlloc<0b10, 1, FPR128Op, simm7s16, "stnp">;

+def : Pat<(AArch64stp GPR64z:$Rt, GPR64z:$Rt2, (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)),
+          (STPXi GPR64z:$Rt, GPR64z:$Rt2, GPR64sp:$Rn, simm7s8:$offset)>;
+
 //---
 // (Register offset)
diff --git a/llvm/test/CodeGen/AArch64/cmpxchg-O0.ll b/llvm/test/CodeGen/AArch64/cmpxchg-O0.ll
--- a/llvm/test/CodeGen/AArch64/cmpxchg-O0.ll
+++ b/llvm/test/CodeGen/AArch64/cmpxchg-O0.ll
@@ -87,10 +87,8 @@
 define {i128, i1} @test_cmpxchg_128_unsplit(i128* %addr) {
 ; CHECK-LABEL: test_cmpxchg_128_unsplit:
 ; CHECK: add x[[VAR128:[0-9]+]], {{x[0-9]+}}, :lo12:var128
-; CHECK: ldr [[DESIRED_HI:x[0-9]+]], [x[[VAR128]], #8]
-; CHECK: ldr [[DESIRED_LO:x[0-9]+]], [x[[VAR128]]]
-; CHECK: ldr [[NEW_HI:x[0-9]+]], [x[[VAR128]], #8]
-; CHECK: ldr [[NEW_LO:x[0-9]+]], [x[[VAR128]]]
+; CHECK: ldp [[DESIRED_LO:x[0-9]+]], [[DESIRED_HI:x[0-9]+]], [x[[VAR128]]]
+; CHECK: ldp [[NEW_LO:x[0-9]+]], [[NEW_HI:x[0-9]+]], [x[[VAR128]]]
 ; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]:
 ; CHECK: ldaxp [[OLD_LO:x[0-9]+]], [[OLD_HI:x[0-9]+]], [x0]
 ; CHECK: cmp [[OLD_LO]], [[DESIRED_LO]]
diff --git a/llvm/test/CodeGen/AArch64/i128_volatile_load_store.ll b/llvm/test/CodeGen/AArch64/i128_volatile_load_store.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/i128_volatile_load_store.ll
@@ -0,0 +1,117 @@
+; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s
+
+@x = common dso_local global i128 0
+@y = common dso_local global i128 0
+
+define void @test1() {
+; CHECK-LABEL: test1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, x
+; CHECK-NEXT: add x8, x8, :lo12:x
+; CHECK-NEXT: ldp x8, x9, [x8]
+; CHECK-NEXT: adrp x10, y
+; CHECK-NEXT: add x10, x10, :lo12:y
+; CHECK-NEXT: stp x8, x9, [x10]
+; CHECK-NEXT: ret
+  %tmp = load volatile i128, i128* @x
+  store volatile i128 %tmp, i128* @y
+  ret void
+}
+
+define void @test2() {
+; CHECK-LABEL: test2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, x
+; CHECK-NEXT: add x8, x8, :lo12:x
+; CHECK-NEXT: ldp x8, x9, [x8, #504]
+; CHECK-NEXT: adrp x10, y
+; CHECK-NEXT: add x10, x10, :lo12:y
+; CHECK-NEXT: stp x8, x9, [x10, #504]
+; CHECK-NEXT: ret
+  %tmp = load volatile i128, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @x to i8*), i64 504) to i128*)
+  store volatile i128 %tmp, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @y to i8*), i64 504) to i128*)
+  ret void
+}
+
+define void @test3() {
+; CHECK-LABEL: test3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, x
+; CHECK-NEXT: add x8, x8, :lo12:x
+; CHECK-NEXT: add x8, x8, #512 // =512
+; CHECK-NEXT: ldp x8, x9, [x8]
+; CHECK-NEXT: adrp x10, y
+; CHECK-NEXT: add x10, x10, :lo12:y
+; CHECK-NEXT: add x10, x10, #512 // =512
+; CHECK-NEXT: stp x8, x9, [x10]
+; CHECK-NEXT: ret
+  %tmp = load volatile i128, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @x to i8*), i64 512) to i128*)
+  store volatile i128 %tmp, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @y to i8*), i64 512) to i128*)
+  ret void
+}
+
+define void @test4() {
+; CHECK-LABEL: test4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, x
+; CHECK-NEXT: add x8, x8, :lo12:x
+; CHECK-NEXT: ldp x8, x9, [x8, #-512]
+; CHECK-NEXT: adrp x10, y
+; CHECK-NEXT: add x10, x10, :lo12:y
+; CHECK-NEXT: stp x8, x9, [x10, #-512]
+; CHECK-NEXT: ret
+  %tmp = load volatile i128, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @x to i8*), i64 -512) to i128*)
+  store volatile i128 %tmp, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @y to i8*), i64 -512) to i128*)
+  ret void
+}
+
+define void @test5() {
+; CHECK-LABEL: test5:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, x
+; CHECK-NEXT: add x8, x8, :lo12:x
+; CHECK-NEXT: sub x8, x8, #520 // =520
+; CHECK-NEXT: ldp x8, x9, [x8]
+; CHECK-NEXT: adrp x10, y
+; CHECK-NEXT: add x10, x10, :lo12:y
+; CHECK-NEXT: sub x10, x10, #520 // =520
+; CHECK-NEXT: stp x8, x9, [x10]
+; CHECK-NEXT: ret
+  %tmp = load volatile i128, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @x to i8*), i64 -520) to i128*)
+  store volatile i128 %tmp, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @y to i8*), i64 -520) to i128*)
+  ret void
+}
+
+define void @test6() {
+; CHECK-LABEL: test6:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, x
+; CHECK-NEXT: add x8, x8, :lo12:x
+; CHECK-NEXT: sub x8, x8, #520 // =520
+; CHECK-NEXT: ldp x8, x9, [x8]
+; CHECK-NEXT: adrp x10, y
+; CHECK-NEXT: add x10, x10, :lo12:y
+; CHECK-NEXT: sub x10, x10, #520 // =520
+; CHECK-NEXT: stp x8, x9, [x10]
+; CHECK-NEXT: ret
+  %tmp = load volatile i128, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @x to i8*), i64 -520) to i128*)
+  store volatile i128 %tmp, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @y to i8*), i64 -520) to i128*)
+  ret void
+}
+
+define void @test7() {
+; CHECK-LABEL: test7:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, x
+; CHECK-NEXT: add x8, x8, :lo12:x
+; CHECK-NEXT: add x8, x8, #503 // =503
+; CHECK-NEXT: ldp x8, x9, [x8]
+; CHECK-NEXT: adrp x10, y
+; CHECK-NEXT: add x10, x10, :lo12:y
+; CHECK-NEXT: add x10, x10, #503 // =503
+; CHECK-NEXT: stp x8, x9, [x10]
+; CHECK-NEXT: ret
+  %tmp = load volatile i128, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @x to i8*), i64 503) to i128*)
+  store volatile i128 %tmp, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @y to i8*), i64 503) to i128*)
+  ret void
+}