Index: llvm/lib/Target/AArch64/AArch64.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64.td
+++ llvm/lib/Target/AArch64/AArch64.td
@@ -219,6 +219,10 @@
                                                  "IsStoreAddressAscend", "false",
                                                  "Schedule vector stores by ascending address">;
 
+def FeatureSchedLoadPrefer : SubtargetFeature<"sched-load-prefer",
+                                              "IsSchedLoadPrefer", "true",
+                                              "Schedule load instructions preferentially after register allocation">;
+
 def FeatureSlowSTRQro : SubtargetFeature<"slow-strqro-store", "IsSTRQroSlow",
                                          "true",
                                          "STR of Q register with register offset is slow">;
Index: llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp
+++ llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp
@@ -13,6 +13,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "aarch64-scheduler"
+
 static bool needReorderStoreMI(const MachineInstr *MI) {
   if (!MI)
     return false;
@@ -66,15 +68,26 @@
     MachineInstr *Instr0 = TryCand.SU->getInstr();
     MachineInstr *Instr1 = Cand.SU->getInstr();
 
-    if (!needReorderStoreMI(Instr0) || !needReorderStoreMI(Instr1))
-      return OriginalResult;
+    LLVM_DEBUG(dbgs() << " Cand: " << *Instr1 << " TryCand: " << *Instr0);
+
+    if (needReorderStoreMI(Instr0) && needReorderStoreMI(Instr1)) {
+      int64_t Off0, Off1;
+      // With the same base address and non-overlapping writes.
+      if (!mayOverlapWrite(*Instr0, *Instr1, Off0, Off1)) {
+        TryCand.Reason = NodeOrder;
+        // Order them by ascending offsets.
+        return Off0 < Off1;
+      }
+    }
 
-    int64_t Off0, Off1;
-    // With the same base address and non-overlapping writes.
-    if (!mayOverlapWrite(*Instr0, *Instr1, Off0, Off1)) {
-      TryCand.Reason = NodeOrder;
-      // Order them by ascending offsets.
-      return Off0 < Off1;
+    // Try to issue load instructions preferentially.
+    if (Instr0->getMF()->getSubtarget<AArch64Subtarget>().isSchedLoadPrefer()) {
+      if (Instr0->mayLoad() && !Instr1->mayLoad()) {
+        TryCand.Reason = NodeOrder;
+        return true;
+      } else if (!Instr0->mayLoad() && Instr1->mayLoad()) {
+        return false;
+      }
     }
   }
 
Index: llvm/test/CodeGen/AArch64/aarch64-sched-load.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/aarch64-sched-load.ll
@@ -0,0 +1,88 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple aarch64 -mcpu=cortex-a57 < %s | FileCheck %s --check-prefixes=DEFAULT
+; RUN: llc -mtriple aarch64 -mcpu=cortex-a57 -mattr=+sched-load-prefer < %s | FileCheck %s --check-prefixes=LOAD
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define i32 @hypre_SeqVectorAxpy(double %alpha, double* nocapture readonly %x, double* nocapture %y, i64 %count) {
+; DEFAULT-LABEL: hypre_SeqVectorAxpy:
+; DEFAULT:       // %bb.0: // %entry
+; DEFAULT-NEXT:    // kill: def $d0 killed $d0 def $q0
+; DEFAULT-NEXT:    dup v0.2d, v0.d[0]
+; DEFAULT-NEXT:    add x8, x1, #16
+; DEFAULT-NEXT:    add x9, x0, #16
+; DEFAULT-NEXT:    .p2align 4, 0x0, 8
+; DEFAULT-NEXT:  .LBB0_1: // %vector.body
+; DEFAULT-NEXT:    // =>This Inner Loop Header: Depth=1
+; DEFAULT-NEXT:    ldp q1, q2, [x9, #-16]
+; DEFAULT-NEXT:    subs x2, x2, #4
+; DEFAULT-NEXT:    add x9, x9, #32
+; DEFAULT-NEXT:    ldp q3, q4, [x8, #-16]
+; DEFAULT-NEXT:    fmla v4.2d, v0.2d, v2.2d
+; DEFAULT-NEXT:    fmla v3.2d, v0.2d, v1.2d
+; DEFAULT-NEXT:    stp q3, q4, [x8, #-16]
+; DEFAULT-NEXT:    add x8, x8, #32
+; DEFAULT-NEXT:    b.ne .LBB0_1
+; DEFAULT-NEXT:  // %bb.2: // %cleanup
+; DEFAULT-NEXT:    mov w0, wzr
+; DEFAULT-NEXT:    ret
+;
+; LOAD-LABEL: hypre_SeqVectorAxpy:
+; LOAD:       // %bb.0: // %entry
+; LOAD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; LOAD-NEXT:    dup v0.2d, v0.d[0]
+; LOAD-NEXT:    add x8, x1, #16
+; LOAD-NEXT:    add x9, x0, #16
+; LOAD-NEXT:    .p2align 4, 0x0, 8
+; LOAD-NEXT:  .LBB0_1: // %vector.body
+; LOAD-NEXT:    // =>This Inner Loop Header: Depth=1
+; LOAD-NEXT:    ldp q1, q2, [x9, #-16]
+; LOAD-NEXT:    ldp q3, q4, [x8, #-16]
+; LOAD-NEXT:    subs x2, x2, #4
+; LOAD-NEXT:    add x9, x9, #32
+; LOAD-NEXT:    fmla v4.2d, v0.2d, v2.2d
+; LOAD-NEXT:    fmla v3.2d, v0.2d, v1.2d
+; LOAD-NEXT:    stp q3, q4, [x8, #-16]
+; LOAD-NEXT:    add x8, x8, #32
+; LOAD-NEXT:    b.ne .LBB0_1
+; LOAD-NEXT:  // %bb.2: // %cleanup
+; LOAD-NEXT:    mov w0, wzr
+; LOAD-NEXT:    ret
+entry:
+  %broadcast.splatinsert = insertelement <2 x double> poison, double %alpha, i32 0
+  %broadcast.splat = shufflevector <2 x double> %broadcast.splatinsert, <2 x double> poison, <2 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %add0 = add i64 %index, 0
+  %ptr0 = getelementptr inbounds double, double* %x, i64 %add0
+  %value0 = getelementptr inbounds double, double* %ptr0, i32 0
+  %vec0 = bitcast double* %value0 to <2 x double>*
+  %wide.load0 = load <2 x double>, <2 x double>* %vec0, align 8
+  %value2 = getelementptr inbounds double, double* %ptr0, i32 2
+  %vec2 = bitcast double* %value2 to <2 x double>*
+  %wide.load2 = load <2 x double>, <2 x double>* %vec2, align 8
+  %mul0 = fmul fast <2 x double> %wide.load0, %broadcast.splat
+  %mul2 = fmul fast <2 x double> %wide.load2, %broadcast.splat
+  %ptry0 = getelementptr inbounds double, double* %y, i64 %add0
+  %valuey0 = getelementptr inbounds double, double* %ptry0, i32 0
+  %vecy0 = bitcast double* %valuey0 to <2 x double>*
+  %wide.loady0 = load <2 x double>, <2 x double>* %vecy0, align 8
+  %valuey2 = getelementptr inbounds double, double* %ptry0, i32 2
+  %vecy2 = bitcast double* %valuey2 to <2 x double>*
+  %wide.loady2 = load <2 x double>, <2 x double>* %vecy2, align 8
+  %fadd0 = fadd fast <2 x double> %wide.loady0, %mul0
+  %fadd2 = fadd fast <2 x double> %wide.loady2, %mul2
+  %vecy0_new = bitcast double* %valuey0 to <2 x double>*
+  store <2 x double> %fadd0, <2 x double>* %vecy0_new, align 8
+  %vecy2_new = bitcast double* %valuey2 to <2 x double>*
+  store <2 x double> %fadd2, <2 x double>* %vecy2_new, align 8
+  %index.next = add nuw i64 %index, 4
+  %cmp = icmp eq i64 %index.next, %count
+  br i1 %cmp, label %cleanup, label %vector.body
+
+cleanup:                                          ; preds = %vector.body
+  ret i32 0
+}
+
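For reference, the IR in the test above is the vectorized form of a plain axpy loop over doubles. A minimal scalar sketch is shown below; the function name and signature follow the test, but the C++ body is an assumption used only to illustrate what the loop computes, not part of the patch.

#include <cstdint>

// Illustrative sketch (assumed pre-vectorization source of the test above):
// y[i] += alpha * x[i]. The vectorized IR in the test processes four doubles
// per iteration (two <2 x double> loads from x, two from y, two fmla ops and
// two stores), so count is assumed to be a multiple of four there.
int hypre_SeqVectorAxpy(double alpha, const double *x, double *y,
                        int64_t count) {
  for (int64_t i = 0; i < count; ++i)
    y[i] += alpha * x[i];
  return 0;
}

The LOAD check lines show the effect of the new subtarget feature: with -mattr=+sched-load-prefer the second ldp issues directly after the first one, ahead of the subs/add, so both vector loads start before the dependent fmla instructions, whereas the DEFAULT output interleaves the scalar ops between the two ldp instructions.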