Index: llvm/lib/Target/AArch64/AArch64.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64.td
+++ llvm/lib/Target/AArch64/AArch64.td
@@ -219,6 +219,10 @@
                                                  "IsStoreAddressAscend", "false",
                                                  "Schedule vector stores by ascending address">;
 
+def FeatureSchedLoadPrefer : SubtargetFeature<"sched-load-prefer",
+                                              "IsSchedLoadPrefer", "true",
+                                              "Schedule load instructions preferentially after register allocation">;
+
 def FeatureSlowSTRQro : SubtargetFeature<"slow-strqro-store", "IsSTRQroSlow",
                                          "true",
                                          "STR of Q register with register offset is slow">;
Index: llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp
+++ llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp
@@ -13,6 +13,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "aarch64-scheduler"
+
 static bool needReorderStoreMI(const MachineInstr *MI) {
   if (!MI)
     return false;
@@ -66,15 +68,26 @@
     MachineInstr *Instr0 = TryCand.SU->getInstr();
     MachineInstr *Instr1 = Cand.SU->getInstr();
 
-    if (!needReorderStoreMI(Instr0) || !needReorderStoreMI(Instr1))
-      return OriginalResult;
+    LLVM_DEBUG(dbgs() << " Cand: " << *Instr1 << " TryCand: " << *Instr0);
+
+    if (needReorderStoreMI(Instr0) && needReorderStoreMI(Instr1)) {
+      int64_t Off0, Off1;
+      // With the same base address and non-overlapping writes.
+      if (!mayOverlapWrite(*Instr0, *Instr1, Off0, Off1)) {
+        TryCand.Reason = NodeOrder;
+        // Order them by ascending offsets.
+        return Off0 < Off1;
+      }
+    }
 
-    int64_t Off0, Off1;
-    // With the same base address and non-overlapping writes.
-    if (!mayOverlapWrite(*Instr0, *Instr1, Off0, Off1)) {
-      TryCand.Reason = NodeOrder;
-      // Order them by ascending offsets.
-      return Off0 < Off1;
+    // Try to issue load instructions preferentially.
+    if (Instr0->getMF()->getSubtarget<AArch64Subtarget>().isSchedLoadPrefer()) {
+      if (Instr0->mayLoad() && !Instr1->mayLoad()) {
+        TryCand.Reason = NodeOrder;
+        return true;
+      } else if (!Instr0->mayLoad() && Instr1->mayLoad()) {
+        return false;
+      }
     }
   }
 
Index: llvm/test/CodeGen/AArch64/aarch64-sched-load.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/aarch64-sched-load.ll
@@ -0,0 +1,88 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple aarch64 -mcpu=cortex-a57 < %s | FileCheck %s --check-prefixes=DEFAULT
+; RUN: llc -mtriple aarch64 -mcpu=cortex-a57 -mattr=+sched-load-prefer < %s | FileCheck %s --check-prefixes=LOAD
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define i32 @hypre_SeqVectorAxpy(double %alpha, double* nocapture readonly %x, double* nocapture %y, i64 %count) {
+; DEFAULT-LABEL: hypre_SeqVectorAxpy:
+; DEFAULT:       // %bb.0: // %entry
+; DEFAULT-NEXT:    // kill: def $d0 killed $d0 def $q0
+; DEFAULT-NEXT:    dup v0.2d, v0.d[0]
+; DEFAULT-NEXT:    add x8, x1, #16
+; DEFAULT-NEXT:    add x9, x0, #16
+; DEFAULT-NEXT:    .p2align 4, 0x0, 8
+; DEFAULT-NEXT:  .LBB0_1: // %vector.body
+; DEFAULT-NEXT:    // =>This Inner Loop Header: Depth=1
+; DEFAULT-NEXT:    ldp q1, q2, [x9, #-16]
+; DEFAULT-NEXT:    subs x2, x2, #4
+; DEFAULT-NEXT:    add x9, x9, #32
+; DEFAULT-NEXT:    ldp q3, q4, [x8, #-16]
+; DEFAULT-NEXT:    fmla v4.2d, v0.2d, v2.2d
+; DEFAULT-NEXT:    fmla v3.2d, v0.2d, v1.2d
+; DEFAULT-NEXT:    stp q3, q4, [x8, #-16]
+; DEFAULT-NEXT:    add x8, x8, #32
+; DEFAULT-NEXT:    b.ne .LBB0_1
+; DEFAULT-NEXT:  // %bb.2: // %cleanup
+; DEFAULT-NEXT:    mov w0, wzr
+; DEFAULT-NEXT:    ret
+;
+; LOAD-LABEL: hypre_SeqVectorAxpy:
+; LOAD:       // %bb.0: // %entry
+; LOAD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; LOAD-NEXT:    dup v0.2d, v0.d[0]
+; LOAD-NEXT:    add x8, x1, #16
+; LOAD-NEXT:    add x9, x0, #16
+; LOAD-NEXT:    .p2align 4, 0x0, 8
+; LOAD-NEXT:  .LBB0_1: // %vector.body
+; LOAD-NEXT:    // =>This Inner Loop Header: Depth=1
+; LOAD-NEXT:    ldp q1, q2, [x9, #-16]
+; LOAD-NEXT:    ldp q3, q4, [x8, #-16]
+; LOAD-NEXT:    subs x2, x2, #4
+; LOAD-NEXT:    add x9, x9, #32
+; LOAD-NEXT:    fmla v4.2d, v0.2d, v2.2d
+; LOAD-NEXT:    fmla v3.2d, v0.2d, v1.2d
+; LOAD-NEXT:    stp q3, q4, [x8, #-16]
+; LOAD-NEXT:    add x8, x8, #32
+; LOAD-NEXT:    b.ne .LBB0_1
+; LOAD-NEXT:  // %bb.2: // %cleanup
+; LOAD-NEXT:    mov w0, wzr
+; LOAD-NEXT:    ret
+entry:
+  %broadcast.splatinsert = insertelement <2 x double> poison, double %alpha, i32 0
+  %broadcast.splat = shufflevector <2 x double> %broadcast.splatinsert, <2 x double> poison, <2 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %add0 = add i64 %index, 0
+  %ptr0 = getelementptr inbounds double, double* %x, i64 %add0
+  %value0 = getelementptr inbounds double, double* %ptr0, i32 0
+  %vec0 = bitcast double* %value0 to <2 x double>*
+  %wide.load0 = load <2 x double>, <2 x double>* %vec0, align 8
+  %value2 = getelementptr inbounds double, double* %ptr0, i32 2
+  %vec2 = bitcast double* %value2 to <2 x double>*
+  %wide.load2 = load <2 x double>, <2 x double>* %vec2, align 8
+  %mul0 = fmul fast <2 x double> %wide.load0, %broadcast.splat
+  %mul2 = fmul fast <2 x double> %wide.load2, %broadcast.splat
+  %ptry0 = getelementptr inbounds double, double* %y, i64 %add0
+  %valuey0 = getelementptr inbounds double, double* %ptry0, i32 0
+  %vecy0 = bitcast double* %valuey0 to <2 x double>*
+  %wide.loady0 = load <2 x double>, <2 x double>* %vecy0, align 8
+  %valuey2 = getelementptr inbounds double, double* %ptry0, i32 2
+  %vecy2 = bitcast double* %valuey2 to <2 x double>*
+  %wide.loady2 = load <2 x double>, <2 x double>* %vecy2, align 8
+  %fadd0 = fadd fast <2 x double> %wide.loady0, %mul0
+  %fadd2 = fadd fast <2 x double> %wide.loady2, %mul2
+  %vecy0_new = bitcast double* %valuey0 to <2 x double>*
+  store <2 x double> %fadd0, <2 x double>* %vecy0_new, align 8
+  %vecy2_new = bitcast double* %valuey2 to <2 x double>*
+  store <2 x double> %fadd2, <2 x double>* %vecy2_new, align 8
+  %index.next = add nuw i64 %index, 4
+  %cmp = icmp eq i64 %index.next, %count
+  br i1 %cmp, label %cleanup, label %vector.body
+
+cleanup:                                          ; preds = %vector.body
+  ret i32 0
+}
+
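For reference, the IR in the test above is the vectorized form of a plain axpy loop over doubles. A minimal scalar sketch is shown below; the function name and signature follow the test, but the C++ body is an assumption used only to illustrate what the loop computes, not part of the patch.

#include <cstdint>

// Illustrative sketch (assumed pre-vectorization source of the test above):
// y[i] += alpha * x[i]. The vectorized IR in the test processes four doubles
// per iteration (two <2 x double> loads from x, two from y, two fmla ops and
// two stores), so count is assumed to be a multiple of four there.
int hypre_SeqVectorAxpy(double alpha, const double *x, double *y,
                        int64_t count) {
  for (int64_t i = 0; i < count; ++i)
    y[i] += alpha * x[i];
  return 0;
}

The LOAD check lines show the effect of the new subtarget feature: with -mattr=+sched-load-prefer the second ldp issues directly after the first one, ahead of the subs/add, so both vector loads start before the dependent fmla instructions, whereas the DEFAULT output interleaves the scalar ops between the two ldp instructions.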