Index: llvm/lib/Target/AArch64/AArch64.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64.td
+++ llvm/lib/Target/AArch64/AArch64.td
@@ -91,9 +91,6 @@
 def FeatureSlowMisaligned128Store : SubtargetFeature<"slow-misaligned-128store",
     "Misaligned128StoreIsSlow", "true", "Misaligned 128 bit stores are slow">;
 
-def FeatureSlowPaired128 : SubtargetFeature<"slow-paired-128",
-    "Paired128IsSlow", "true", "Paired 128 bit loads and stores are slow">;
-
 def FeatureSlowSTRQro : SubtargetFeature<"slow-strqro-store", "STRQroIsSlow",
     "true", "STR of Q register with register offset is slow">;
 
@@ -294,8 +291,7 @@
 
 def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
                                     "Samsung Exynos-M1 processors",
-                                    [FeatureSlowPaired128,
-                                     FeatureCRC,
+                                    [FeatureCRC,
                                      FeatureCrypto,
                                      FeatureCustomCheapAsMoveHandling,
                                      FeatureFPARMv8,
@@ -309,8 +305,7 @@
 
 def ProcExynosM2 : SubtargetFeature<"exynosm2", "ARMProcFamily", "ExynosM1",
                                     "Samsung Exynos-M2/M3 processors",
-                                    [FeatureSlowPaired128,
-                                     FeatureCRC,
+                                    [FeatureCRC,
                                      FeatureCrypto,
                                      FeatureCustomCheapAsMoveHandling,
                                      FeatureFPARMv8,
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -1894,19 +1894,6 @@
   if (isLdStPairSuppressed(MI))
     return false;
 
-  // On some CPUs quad load/store pairs are slower than two single load/stores.
-  if (Subtarget.isPaired128Slow()) {
-    switch (MI.getOpcode()) {
-    default:
-      break;
-    case AArch64::LDURQi:
-    case AArch64::STURQi:
-    case AArch64::LDRQui:
-    case AArch64::STRQui:
-      return false;
-    }
-  }
-
   return true;
 }
 
Index: llvm/lib/Target/AArch64/AArch64Subtarget.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -102,7 +102,6 @@
   bool CustomAsCheapAsMove = false;
   bool UsePostRAScheduler = false;
   bool Misaligned128StoreIsSlow = false;
-  bool Paired128IsSlow = false;
   bool STRQroIsSlow = false;
   bool UseAlternateSExtLoadCVTF32Pattern = false;
   bool HasArithmeticBccFusion = false;
@@ -220,7 +219,6 @@
   }
   bool hasCustomCheapAsMoveHandling() const { return CustomAsCheapAsMove; }
   bool isMisaligned128StoreSlow() const { return Misaligned128StoreIsSlow; }
-  bool isPaired128Slow() const { return Paired128IsSlow; }
   bool isSTRQroSlow() const { return STRQroIsSlow; }
   bool useAlternateSExtLoadCVTF32Pattern() const {
     return UseAlternateSExtLoadCVTF32Pattern;
Index: llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
+++ llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
@@ -1,6 +1,6 @@
 ; REQUIRES: asserts
-; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
-; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=exynos-m1 -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck --check-prefix=EXYNOS %s
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s --check-prefixes=CHECK,CORTEX
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=exynos-m1  -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s --check-prefixes=CHECK,EXYNOS
 
 ; Test ldr clustering.
 ; CHECK: ********** MI Scheduling **********
@@ -8,11 +8,6 @@
 ; CHECK: Cluster ld/st SU(1) - SU(2)
 ; CHECK: SU(1):   %vreg{{[0-9]+}}<def> = LDRWui
 ; CHECK: SU(2):   %vreg{{[0-9]+}}<def> = LDRWui
-; EXYNOS: ********** MI Scheduling **********
-; EXYNOS-LABEL: ldr_int:BB#0
-; EXYNOS: Cluster ld/st SU(1) - SU(2)
-; EXYNOS: SU(1):   %vreg{{[0-9]+}}<def> = LDRWui
-; EXYNOS: SU(2):   %vreg{{[0-9]+}}<def> = LDRWui
 define i32 @ldr_int(i32* %a) nounwind {
   %p1 = getelementptr inbounds i32, i32* %a, i32 1
   %tmp1 = load i32, i32* %p1, align 2
@@ -28,11 +23,6 @@
 ; CHECK: Cluster ld/st SU(1) - SU(2)
 ; CHECK: SU(1):   %vreg{{[0-9]+}}<def> = LDRSWui
 ; CHECK: SU(2):   %vreg{{[0-9]+}}<def> = LDRSWui
-; EXYNOS: ********** MI Scheduling **********
-; EXYNOS-LABEL: ldp_sext_int:BB#0
-; EXYNOS: Cluster ld/st SU(1) - SU(2)
-; EXYNOS: SU(1):   %vreg{{[0-9]+}}<def> = LDRSWui
-; EXYNOS: SU(2):   %vreg{{[0-9]+}}<def> = LDRSWui
 define i64 @ldp_sext_int(i32* %p) nounwind {
   %tmp = load i32, i32* %p, align 4
   %add.ptr = getelementptr inbounds i32, i32* %p, i64 1
@@ -49,11 +39,6 @@
 ; CHECK: Cluster ld/st SU(2) - SU(1)
 ; CHECK: SU(1):   %vreg{{[0-9]+}}<def> = LDURWi
 ; CHECK: SU(2):   %vreg{{[0-9]+}}<def> = LDURWi
-; EXYNOS: ********** MI Scheduling **********
-; EXYNOS-LABEL: ldur_int:BB#0
-; EXYNOS: Cluster ld/st SU(2) - SU(1)
-; EXYNOS: SU(1):   %vreg{{[0-9]+}}<def> = LDURWi
-; EXYNOS: SU(2):   %vreg{{[0-9]+}}<def> = LDURWi
 define i32 @ldur_int(i32* %a) nounwind {
   %p1 = getelementptr inbounds i32, i32* %a, i32 -1
   %tmp1 = load i32, i32* %p1, align 2
@@ -69,11 +54,6 @@
 ; CHECK: Cluster ld/st SU(3) - SU(4)
 ; CHECK: SU(3):   %vreg{{[0-9]+}}<def> = LDRSWui
 ; CHECK: SU(4):   %vreg{{[0-9]+}}:sub_32<def,read-undef> = LDRWui
-; EXYNOS: ********** MI Scheduling **********
-; EXYNOS-LABEL: ldp_half_sext_zext_int:BB#0
-; EXYNOS: Cluster ld/st SU(3) - SU(4)
-; EXYNOS: SU(3):   %vreg{{[0-9]+}}<def> = LDRSWui
-; EXYNOS: SU(4):   %vreg{{[0-9]+}}:sub_32<def,read-undef> = LDRWui
 define i64 @ldp_half_sext_zext_int(i64* %q, i32* %p) nounwind {
   %tmp0 = load i64, i64* %q, align 4
   %tmp = load i32, i32* %p, align 4
@@ -92,11 +72,6 @@
 ; CHECK: Cluster ld/st SU(3) - SU(4)
 ; CHECK: SU(3):   %vreg{{[0-9]+}}:sub_32<def,read-undef> = LDRWui
 ; CHECK: SU(4):   %vreg{{[0-9]+}}<def> = LDRSWui
-; EXYNOS: ********** MI Scheduling **********
-; EXYNOS-LABEL: ldp_half_zext_sext_int:BB#0
-; EXYNOS: Cluster ld/st SU(3) - SU(4)
-; EXYNOS: SU(3):   %vreg{{[0-9]+}}:sub_32<def,read-undef> = LDRWui
-; EXYNOS: SU(4):   %vreg{{[0-9]+}}<def> = LDRSWui
 define i64 @ldp_half_zext_sext_int(i64* %q, i32* %p) nounwind {
   %tmp0 = load i64, i64* %q, align 4
   %tmp = load i32, i32* %p, align 4
@@ -115,11 +90,6 @@
 ; CHECK-NOT: Cluster ld/st
 ; CHECK: SU(1):   %vreg{{[0-9]+}}<def> = LDRWui
 ; CHECK: SU(2):   %vreg{{[0-9]+}}<def> = LDRWui
-; EXYNOS: ********** MI Scheduling **********
-; EXYNOS-LABEL: ldr_int_volatile:BB#0
-; EXYNOS-NOT: Cluster ld/st
-; EXYNOS: SU(1):   %vreg{{[0-9]+}}<def> = LDRWui
-; EXYNOS: SU(2):   %vreg{{[0-9]+}}<def> = LDRWui
 define i32 @ldr_int_volatile(i32* %a) nounwind {
   %p1 = getelementptr inbounds i32, i32* %a, i32 1
   %tmp1 = load volatile i32, i32* %p1, align 2
@@ -129,15 +99,12 @@
   ret i32 %tmp3
 }
 
-; Test ldq clustering (no clustering for Exynos).
+; Test ldq clustering.
 ; CHECK: ********** MI Scheduling **********
 ; CHECK-LABEL: ldq_cluster:BB#0
 ; CHECK: Cluster ld/st SU(1) - SU(3)
 ; CHECK: SU(1):   %vreg{{[0-9]+}}<def> = LDRQui
 ; CHECK: SU(3):   %vreg{{[0-9]+}}<def> = LDRQui
-; EXYNOS: ********** MI Scheduling **********
-; EXYNOS-LABEL: ldq_cluster:BB#0
-; EXYNOS-NOT: Cluster ld/st
 define <2 x i64> @ldq_cluster(i64* %p) {
   %a1 = bitcast i64* %p to <2 x i64>*
   %tmp1 = load <2 x i64>, < 2 x i64>* %a1, align 8
Index: llvm/test/CodeGen/AArch64/ldst-opt.ll
===================================================================
--- llvm/test/CodeGen/AArch64/ldst-opt.ll
+++ llvm/test/CodeGen/AArch64/ldst-opt.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 -disable-lsr -verify-machineinstrs                      -o - %s | FileCheck %s --check-prefixes=CHECK,GENERIC,NOSTRICTALIGN
 ; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 -disable-lsr -verify-machineinstrs -mattr=+strict-align -o - %s | FileCheck %s --check-prefixes=CHECK,GENERIC,STRICTALIGN
-; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 -disable-lsr -verify-machineinstrs -mcpu=exynos-m1      -o - %s | FileCheck %s --check-prefixes=CHECK,EXYNOS
+; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 -disable-lsr -verify-machineinstrs -mcpu=exynos-m1      -o - %s | FileCheck %s --check-prefixes=CHECK,EXYNOS,NOSTRICTALIGN
 
 ; This file contains tests for the AArch64 load/store optimizer.
 
@@ -1141,8 +1141,8 @@
 define void @store-pair-post-indexed-quadword() nounwind {
 ; CHECK-LABEL: store-pair-post-indexed-quadword
 ; GENERIC: stp q{{[0-9]+}}, q{{[0-9]+}}, [sp], #64
-; EXYNOS: str q{{[0-9]+}}, [sp]
-; EXYNOS-NEXT: str q{{[0-9]+}}, [sp, #16]
+; EXYNOS: stp q{{[0-9]+}}, q{{[0-9]+}}, [sp]
+; EXYNOS-NEXT: add sp, sp, #64
   %src = alloca { fp128, fp128 }, align 8
   %dst = alloca { fp128, fp128 }, align 8
 
@@ -1552,7 +1552,6 @@
 define void @merge_zr64(i64* %p) {
 ; CHECK-LABEL: merge_zr64:
 ; CHECK: stp xzr, xzr, [x{{[0-9]+}}]
-; CHECK-NEXT: ret
 entry:
   store i64 0, i64* %p
   %p1 = getelementptr i64, i64* %p, i64 1
@@ -1627,9 +1626,7 @@
 define void @merge_zr64_4vecd(<4 x double>* %p) {
 ; CHECK-LABEL: merge_zr64_4vecd:
 ; CHECK: movi v[[REG:[0-9]]].2d, #0000000000000000
-; GENERIC-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}]
-; EXYNOS-NEXT: str q[[REG]], [x{{[0-9]+}}, #16]
-; EXYNOS-NEXT: str q[[REG]], [x{{[0-9]+}}]
+; CHECK-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}]
 entry:
   store <4 x double> zeroinitializer, <4 x double>* %p
   ret void
Index: llvm/test/CodeGen/AArch64/no-quad-ldp-stp.ll
===================================================================
--- llvm/test/CodeGen/AArch64/no-quad-ldp-stp.ll
+++ /dev/null
@@ -1,29 +0,0 @@
-; RUN: llc < %s -mtriple=aarch64-eabi -mattr=+slow-paired-128 -verify-machineinstrs -asm-verbose=false | FileCheck %s
-; RUN: llc < %s -mtriple=aarch64-eabi -mcpu=exynos-m1 -verify-machineinstrs -asm-verbose=false | FileCheck %s
-
-; CHECK-LABEL: test_nopair_st
-; CHECK: str
-; CHECK: stur
-; CHECK-NOT: stp
-define void @test_nopair_st(double* %ptr, <2 x double> %v1, <2 x double> %v2) {
-  %tmp1 = bitcast double* %ptr to <2 x double>*
-  store <2 x double> %v2, <2 x double>* %tmp1, align 16
-  %add.ptr = getelementptr inbounds double, double* %ptr, i64 -2
-  %tmp = bitcast double* %add.ptr to <2 x double>*
-  store <2 x double> %v1, <2 x double>* %tmp, align 16
-  ret void
-}
-
-; CHECK-LABEL: test_nopair_ld
-; CHECK: ldr
-; CHECK: ldr
-; CHECK-NOT: ldp
-define <2 x i64> @test_nopair_ld(i64* %p) {
-  %a1 = bitcast i64* %p to <2 x i64>*
-  %tmp1 = load <2 x i64>, < 2 x i64>* %a1, align 8
-  %add.ptr2 = getelementptr inbounds i64, i64* %p, i64 2
-  %a2 = bitcast i64* %add.ptr2 to <2 x i64>*
-  %tmp2 = load <2 x i64>, <2 x i64>* %a2, align 8
-  %add = add nsw <2 x i64> %tmp1, %tmp2
-  ret <2 x i64> %add
-}
Index: llvm/test/CodeGen/AArch64/quad-ldp-stp.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/quad-ldp-stp.ll
@@ -0,0 +1,82 @@
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=generic    | FileCheck %s --check-prefixes=CHECK,GENERIC
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=cortex-a57 | FileCheck %s --check-prefixes=CHECK,GENERIC
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=exynos-m1  | FileCheck %s --check-prefixes=CHECK,EXYNOS
+
+; Test formation of 128-bit (Q register) load/store pairs from adjacent
+; accesses, with and without a base register update, for the generic,
+; Cortex-A57 and Exynos-M1 CPUs selected by the RUN lines above.
+
+; Base decrement before two loads: a pre-indexed LDR and a plain LDR are expected.
+; CHECK-LABEL: test_pre_pair_ld
+; CHECK: ldr q{{[0-9]+}}, [x0, #-32]!
+; CHECK: ldr q{{[0-9]+}}, [x0, #16]
+define <2 x i64>* @test_pre_pair_ld(<2 x i64>* %p, <2 x i64>* %q) {
+  %p1 = getelementptr inbounds <2 x i64>, <2 x i64>* %p, i64 -1
+  %tmp0 = load <2 x i64>, <2 x i64>* %p1, align 16
+  %p2 = getelementptr inbounds <2 x i64>, <2 x i64>* %p, i64 -2
+  %tmp1 = load <2 x i64>, <2 x i64>* %p2, align 16
+  %add = add nsw <2 x i64> %tmp0, %tmp1
+  store <2 x i64> %add, <2 x i64>* %q, align 16
+  ret <2 x i64>* %p2
+}
+
+; Two adjacent loads, no base update: a single LDP is expected on every CPU.
+; CHECK-LABEL: test_pair_ld
+; CHECK: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x{{[0-9]+}}]
+define <2 x i64> @test_pair_ld(i64* %p) {
+  %a1 = bitcast i64* %p to <2 x i64>*
+  %tmp1 = load <2 x i64>, <2 x i64>* %a1, align 8
+  %add.ptr2 = getelementptr inbounds i64, i64* %p, i64 2
+  %a2 = bitcast i64* %add.ptr2 to <2 x i64>*
+  %tmp2 = load <2 x i64>, <2 x i64>* %a2, align 8
+  %add = add nsw <2 x i64> %tmp1, %tmp2
+  ret <2 x i64> %add
+}
+
+; Base increment after two loads: only generic/Cortex-A57 expect it folded into LDP.
+; CHECK-LABEL: test_post_pair_ld
+; GENERIC: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x0], #32
+; EXYNOS: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x0]{{$}}
+define <2 x i64>* @test_post_pair_ld(<2 x i64>* %p, <2 x i64>* %q) {
+  %tmp0 = load <2 x i64>, <2 x i64>* %p, align 16
+  %p1 = getelementptr inbounds <2 x i64>, <2 x i64>* %p, i64 1
+  %tmp1 = load <2 x i64>, <2 x i64>* %p1, align 16
+  %add = add nsw <2 x i64> %tmp0, %tmp1
+  store <2 x i64> %add, <2 x i64>* %q, align 16
+  %p2 = getelementptr inbounds <2 x i64>, <2 x i64>* %p, i64 2
+  ret <2 x i64>* %p2
+}
+
+; Base decrement before two stores: a pre-indexed STR and a plain STR are expected.
+; CHECK-LABEL: test_pre_pair_st
+; CHECK: str q{{[0-9]+}}, [x0, #-32]!
+; CHECK: str q{{[0-9]+}}, [x0, #16]
+define <2 x double>* @test_pre_pair_st(<2 x double>* %ptr, <2 x double> %v1, <2 x double> %v2) {
+  %ptr1 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 -1
+  store <2 x double> %v1, <2 x double>* %ptr1, align 16
+  %ptr2 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 -2
+  store <2 x double> %v2, <2 x double>* %ptr2, align 16
+  ret <2 x double>* %ptr2
+}
+
+; Two adjacent stores, no base update: a single STP is expected on every CPU.
+; CHECK-LABEL: test_pair_st
+; CHECK: stp q{{[0-9]+}}, q{{[0-9]+}}, [x0, #-16]
+define void @test_pair_st(<2 x double>* %ptr, <2 x double> %v1, <2 x double> %v2) {
+  store <2 x double> %v2, <2 x double>* %ptr, align 16
+  %ptr1 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 -1
+  store <2 x double> %v1, <2 x double>* %ptr1, align 16
+  ret void
+}
+
+; Base increment after two stores: only generic/Cortex-A57 expect it folded into STP.
+; CHECK-LABEL: test_post_pair_st
+; GENERIC: stp q{{[0-9]+}}, q{{[0-9]+}}, [x0], #32
+; EXYNOS: stp q{{[0-9]+}}, q{{[0-9]+}}, [x0]{{$}}
+define <2 x double>* @test_post_pair_st(<2 x double>* %ptr, <2 x double> %v1, <2 x double> %v2) {
+  store <2 x double> %v1, <2 x double>* %ptr, align 16
+  %ptr1 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 1
+  store <2 x double> %v2, <2 x double>* %ptr1, align 16
+  %ptr2 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 2
+  ret <2 x double>* %ptr2
+}