Index: llvm/lib/Target/AArch64/AArch64.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64.td
+++ llvm/lib/Target/AArch64/AArch64.td
@@ -215,6 +215,9 @@
 def FeatureSlowPaired128 : SubtargetFeature<"slow-paired-128",
     "IsPaired128Slow", "true", "Paired 128 bit loads and stores are slow">;
 
+def FeatureAscendStoreAddress : SubtargetFeature<"ascend-store-address",
+    "IsStoreAddressAscend", "false", "Schedule vector stores by ascending address">;
+
 def FeatureSlowSTRQro : SubtargetFeature<"slow-strqro-store", "IsSTRQroSlow",
     "true", "STR of Q register with register offset is slow">;
 
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -103,6 +103,8 @@
   /// Returns whether the instruction is a pre-indexed load/store.
   static bool isPreLdSt(const MachineInstr &MI);
 
+  static bool isPairedLdSt(const MachineInstr &MI);
+
   /// Returns whether the instruction is FP or NEON.
   static bool isFpOrNEON(const MachineInstr &MI);
 
@@ -492,6 +494,8 @@
 /// Return opcode to be used for indirect calls.
 unsigned getBLRCallOpcode(const MachineFunction &MF);
 
+const MachineOperand &getLdStBaseOp(const MachineInstr &MI);
+const MachineOperand &getLdStOffsetOp(const MachineInstr &MI);
 // struct TSFlags {
 #define TSFLAG_ELEMENT_SIZE_TYPE(X) (X) // 3-bits
 
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -3152,6 +3152,26 @@
   return isPreLd(MI) || isPreSt(MI);
 }
 
+bool AArch64InstrInfo::isPairedLdSt(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+  case AArch64::LDPSi:
+  case AArch64::LDPSWi:
+  case AArch64::LDPDi:
+  case AArch64::LDPQi:
+  case AArch64::LDPWi:
+  case AArch64::LDPXi:
+  case AArch64::STPSi:
+  case AArch64::STPDi:
+  case AArch64::STPQi:
+  case AArch64::STPWi:
+  case AArch64::STPXi:
+  case AArch64::STGPi:
+    return true;
+  }
+}
+
 static const TargetRegisterClass *getRegClass(const MachineInstr &MI,
                                               Register Reg) {
   if (MI.getParent() == nullptr)
@@ -8009,6 +8029,22 @@
   return AArch64::BLR;
 }
 
+namespace llvm {
+const MachineOperand &getLdStBaseOp(const MachineInstr &MI) {
+  unsigned Idx =
+      AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2
+                                                                            : 1;
+  return MI.getOperand(Idx);
+}
+
+const MachineOperand &getLdStOffsetOp(const MachineInstr &MI) {
+  unsigned Idx =
+      AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3
+                                                                            : 2;
+  return MI.getOperand(Idx);
+}
+} // namespace llvm
+
 #define GET_INSTRINFO_HELPERS
 #define GET_INSTRMAP_INFO
 #include "AArch64GenInstrInfo.inc"
Index: llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -556,26 +556,6 @@
   }
 }
 
-static bool isPairedLdSt(const MachineInstr &MI) {
-  switch (MI.getOpcode()) {
-  default:
-    return false;
-  case AArch64::LDPSi:
-  case AArch64::LDPSWi:
-  case AArch64::LDPDi:
-  case AArch64::LDPQi:
-  case AArch64::LDPWi:
-  case AArch64::LDPXi:
-  case AArch64::STPSi:
-  case AArch64::STPDi:
-  case AArch64::STPQi:
-  case AArch64::STPWi:
-  case AArch64::STPXi:
-  case AArch64::STGPi:
-    return true;
-  }
-}
-
 static bool isPreLdStPairCandidate(MachineInstr &FirstMI, MachineInstr &MI) {
 
   unsigned OpcA = FirstMI.getOpcode();
@@ -610,7 +590,7 @@
 // Returns the scale and offset range of pre/post indexed variants of MI.
 static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale,
                                        int &MinOffset, int &MaxOffset) {
-  bool IsPaired = isPairedLdSt(MI);
+  bool IsPaired = AArch64InstrInfo::isPairedLdSt(MI);
   bool IsTagStore = isTagStore(MI);
   // ST*G and all paired ldst have the same scale in pre/post-indexed variants
   // as in the "unsigned offset" variant.
@@ -632,17 +612,8 @@
   bool IsPreLdSt = AArch64InstrInfo::isPreLdSt(MI);
   if (IsPreLdSt)
     PairedRegOp += 1;
-  unsigned Idx = isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0;
-  return MI.getOperand(Idx);
-}
-
-static const MachineOperand &getLdStBaseOp(const MachineInstr &MI) {
-  unsigned Idx = isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2 : 1;
-  return MI.getOperand(Idx);
-}
-
-static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI) {
-  unsigned Idx = isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3 : 2;
+  unsigned Idx =
+      AArch64InstrInfo::isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0;
   return MI.getOperand(Idx);
 }
 
@@ -1823,7 +1794,7 @@
   MachineInstrBuilder MIB;
   int Scale, MinOffset, MaxOffset;
   getPrePostIndexedMemOpInfo(*I, Scale, MinOffset, MaxOffset);
-  if (!isPairedLdSt(*I)) {
+  if (!AArch64InstrInfo::isPairedLdSt(*I)) {
     // Non-paired instruction.
     MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
               .add(getLdStRegOp(*Update))
@@ -1944,7 +1915,7 @@
   // behavior in this case unlike normal stores, and always performs writeback
   // after reading the source register value.
   if (!isTagStore(MemMI) && MemMI.getOpcode() != AArch64::STGPi) {
-    bool IsPairedInsn = isPairedLdSt(MemMI);
+    bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
     for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
       Register DestReg = getLdStRegOp(MemMI, i).getReg();
       if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
@@ -2015,7 +1986,7 @@
   // If the base register overlaps a destination register, we can't
   // merge the update.
   if (!isTagStore(MemMI)) {
-    bool IsPairedInsn = isPairedLdSt(MemMI);
+    bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
     for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
       Register DestReg = getLdStRegOp(MemMI, i).getReg();
       if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
Index: llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp
+++ llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp
@@ -7,10 +7,57 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64MachineScheduler.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
 #include "MCTargetDesc/AArch64MCTargetDesc.h"
 
 using namespace llvm;
 
+static bool needReorderStoreMI(const MachineInstr *MI) {
+  if (!MI)
+    return false;
+
+  switch (MI->getOpcode()) {
+  default:
+    return false;
+  case AArch64::STURQi:
+  case AArch64::STRQui:
+    if (MI->getMF()->getSubtarget<AArch64Subtarget>().isStoreAddressAscend())
+      return false;
+    LLVM_FALLTHROUGH;
+  case AArch64::STPQi:
+    return getLdStOffsetOp(*MI).getType() == MachineOperand::MO_Immediate;
+  }
+
+  return false;
+}
+
+// Return true if two stores with the same base address may overlap writes.
+static bool mayOverlapWrite(const MachineInstr &MI0, const MachineInstr &MI1,
+                            int64_t &Off0, int64_t &Off1) {
+  const MachineOperand &Base0 = getLdStBaseOp(MI0);
+  const MachineOperand &Base1 = getLdStBaseOp(MI1);
+
+  // Assume overlapping writes if the two stores do not have the same base.
+  if (!Base0.isIdenticalTo(Base1))
+    return true;
+
+  int StoreSize0 = AArch64InstrInfo::getMemScale(MI0);
+  int StoreSize1 = AArch64InstrInfo::getMemScale(MI1);
+  Off0 = AArch64InstrInfo::hasUnscaledLdStOffset(MI0.getOpcode())
+             ? getLdStOffsetOp(MI0).getImm()
+             : getLdStOffsetOp(MI0).getImm() * StoreSize0;
+  Off1 = AArch64InstrInfo::hasUnscaledLdStOffset(MI1.getOpcode())
+             ? getLdStOffsetOp(MI1).getImm()
+             : getLdStOffsetOp(MI1).getImm() * StoreSize1;
+
+  const MachineInstr &MI = (Off0 < Off1) ? MI0 : MI1;
+  int Multiples = AArch64InstrInfo::isPairedLdSt(MI) ? 2 : 1;
+  int StoreSize = AArch64InstrInfo::getMemScale(MI) * Multiples;
+
+  return llabs(Off0 - Off1) < StoreSize;
+}
+
 bool AArch64PostRASchedStrategy::tryCandidate(SchedCandidate &Cand,
                                               SchedCandidate &TryCand) {
   bool OriginalResult = PostGenericScheduler::tryCandidate(Cand, TryCand);
@@ -18,20 +65,16 @@
   if (Cand.isValid()) {
     MachineInstr *Instr0 = TryCand.SU->getInstr();
     MachineInstr *Instr1 = Cand.SU->getInstr();
-    // When dealing with two STPqi's.
-    if (Instr0 && Instr1 && Instr0->getOpcode() == Instr1->getOpcode () &&
-        Instr0->getOpcode() == AArch64::STPQi)
-    {
-      MachineOperand &Base0 = Instr0->getOperand(2);
-      MachineOperand &Base1 = Instr1->getOperand(2);
-      int64_t Off0 = Instr0->getOperand(3).getImm();
-      int64_t Off1 = Instr1->getOperand(3).getImm();
-      // With the same base address and non-overlapping writes.
-      if (Base0.isIdenticalTo(Base1) && llabs (Off0 - Off1) >= 2) {
-        TryCand.Reason = NodeOrder;
-        // Order them by ascending offsets.
-        return Off0 < Off1;
-      }
+
+    if (!needReorderStoreMI(Instr0) || !needReorderStoreMI(Instr1))
+      return OriginalResult;
+
+    int64_t Off0, Off1;
+    // With the same base address and non-overlapping writes.
+    if (!mayOverlapWrite(*Instr0, *Instr1, Off0, Off1)) {
+      TryCand.Reason = NodeOrder;
+      // Order them by ascending offsets.
+      return Off0 < Off1;
     }
   }
 
Index: llvm/test/CodeGen/AArch64/aarch64-sched-store.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/aarch64-sched-store.ll
@@ -0,0 +1,89 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple aarch64 -mcpu=cortex-a57 -mattr=+slow-paired-128 < %s | FileCheck %s --check-prefixes=DEFAULT
+; RUN: llc -mtriple aarch64 -mcpu=cortex-a57 -mattr=+slow-paired-128 -mattr=+ascend-store-address < %s | FileCheck %s --check-prefixes=ASCEND
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define dso_local void @memset_unroll2(double* nocapture %array, i64 %size) {
+; DEFAULT-LABEL: memset_unroll2:
+; DEFAULT:       // %bb.0: // %entry
+; DEFAULT-NEXT:    fmov v0.2d, #2.00000000
+; DEFAULT-NEXT:    add x8, x0, #64
+; DEFAULT-NEXT:    .p2align 4, 0x0, 8
+; DEFAULT-NEXT:  .LBB0_1: // %vector.body
+; DEFAULT-NEXT:    // =>This Inner Loop Header: Depth=1
+; DEFAULT-NEXT:    stur q0, [x8, #-64]
+; DEFAULT-NEXT:    subs x1, x1, #4
+; DEFAULT-NEXT:    stur q0, [x8, #-48]
+; DEFAULT-NEXT:    str q0, [x8]
+; DEFAULT-NEXT:    str q0, [x8, #16]
+; DEFAULT-NEXT:    str q0, [x8, #32]
+; DEFAULT-NEXT:    str q0, [x8, #48]
+; DEFAULT-NEXT:    stur q0, [x8, #-32]
+; DEFAULT-NEXT:    stur q0, [x8, #-16]
+; DEFAULT-NEXT:    add x8, x8, #128
+; DEFAULT-NEXT:    b.ne .LBB0_1
+; DEFAULT-NEXT:  // %bb.2: // %cleanup
+; DEFAULT-NEXT:    ret
+;
+; ASCEND-LABEL: memset_unroll2:
+; ASCEND:       // %bb.0: // %entry
+; ASCEND-NEXT:    fmov v0.2d, #2.00000000
+; ASCEND-NEXT:    add x8, x0, #64
+; ASCEND-NEXT:    .p2align 4, 0x0, 8
+; ASCEND-NEXT:  .LBB0_1: // %vector.body
+; ASCEND-NEXT:    // =>This Inner Loop Header: Depth=1
+; ASCEND-NEXT:    stur q0, [x8, #-64]
+; ASCEND-NEXT:    subs x1, x1, #4
+; ASCEND-NEXT:    stur q0, [x8, #-48]
+; ASCEND-NEXT:    stur q0, [x8, #-32]
+; ASCEND-NEXT:    stur q0, [x8, #-16]
+; ASCEND-NEXT:    str q0, [x8]
+; ASCEND-NEXT:    str q0, [x8, #16]
+; ASCEND-NEXT:    str q0, [x8, #32]
+; ASCEND-NEXT:    str q0, [x8, #48]
+; ASCEND-NEXT:    add x8, x8, #128
+; ASCEND-NEXT:    b.ne .LBB0_1
+; ASCEND-NEXT:  // %bb.2: // %cleanup
+; ASCEND-NEXT:    ret
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index16, %vector.body ]
+  %niter = phi i64 [ %size, %entry ], [ %niter.nsub.3, %vector.body ]
+  %array0 = getelementptr inbounds double, double* %array, i64 %index
+  %array0.cast = bitcast double* %array0 to <2 x double>*
+  store <2 x double> <double 2.000000e+00, double 2.000000e+00>, <2 x double>* %array0.cast, align 8
+  %array2 = getelementptr inbounds double, double* %array0, i64 2
+  %array2.cast = bitcast double* %array2 to <2 x double>*
+  store <2 x double> <double 2.000000e+00, double 2.000000e+00>, <2 x double>* %array2.cast, align 8
+  %index4 = or i64 %index, 4
+  %array4 = getelementptr inbounds double, double* %array, i64 %index4
+  %array4.cast = bitcast double* %array4 to <2 x double>*
+  store <2 x double> <double 2.000000e+00, double 2.000000e+00>, <2 x double>* %array4.cast, align 8
+  %array6 = getelementptr inbounds double, double* %array4, i64 2
+  %array6.cast = bitcast double* %array6 to <2 x double>*
+  store <2 x double> <double 2.000000e+00, double 2.000000e+00>, <2 x double>* %array6.cast, align 8
+  %index8 = or i64 %index, 8
+  %array8 = getelementptr inbounds double, double* %array, i64 %index8
+  %array8.cast = bitcast double* %array8 to <2 x double>*
+  store <2 x double> <double 2.000000e+00, double 2.000000e+00>, <2 x double>* %array8.cast, align 8
+  %array10 = getelementptr inbounds double, double* %array8, i64 2
+  %array10.cast = bitcast double* %array10 to <2 x double>*
+  store <2 x double> <double 2.000000e+00, double 2.000000e+00>, <2 x double>* %array10.cast, align 8
+  %index12 = or i64 %index, 12
+  %array12 = getelementptr inbounds double, double* %array, i64 %index12
+  %array12.cast = bitcast double* %array12 to <2 x double>*
+  store <2 x double> <double 2.000000e+00, double 2.000000e+00>, <2 x double>* %array12.cast, align 8
+  %array14 = getelementptr inbounds double, double* %array12, i64 2
+  %array14.cast = bitcast double* %array14 to <2 x double>*
+  store <2 x double> <double 2.000000e+00, double 2.000000e+00>, <2 x double>* %array14.cast, align 8
+  %index16 = add i64 %index, 16
+  %niter.nsub.3 = add i64 %niter, -4
+  %niter.ncmp.3 = icmp eq i64 %niter.nsub.3, 0
+  br i1 %niter.ncmp.3, label %cleanup, label %vector.body
+
+cleanup:                                          ; preds = %vector.body
+  ret void
+}
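
Standalone sketch (separate from the diff above): a minimal C++ illustration of the offset arithmetic that mayOverlapWrite() applies to the unscaled STURQi stores exercised by the test. The StoreInfo struct, mayOverlap() helper, and hard-coded offsets are illustrative assumptions only, not LLVM APIs.

// Minimal sketch: mimics the overlap check from mayOverlapWrite() for two
// 16-byte Q-register stores off the same base register. StoreInfo and
// mayOverlap() are hypothetical helpers, not part of LLVM.
#include <cstdint>
#include <cstdio>
#include <cstdlib>

struct StoreInfo {
  int64_t Imm;     // immediate as encoded in the instruction
  int Scale;       // 1 for unscaled STURQi, 16 for scaled STRQui
  int AccessBytes; // 16 for a single Q store, 32 for STPQi
};

// Two stores may overlap when the distance between their byte offsets is
// smaller than the access size of the lower-addressed store.
static bool mayOverlap(const StoreInfo &A, const StoreInfo &B) {
  int64_t Off0 = A.Imm * A.Scale;
  int64_t Off1 = B.Imm * B.Scale;
  const StoreInfo &Lower = (Off0 < Off1) ? A : B;
  return std::llabs(Off0 - Off1) < Lower.AccessBytes;
}

int main() {
  // stur q0, [x8, #-64] and stur q0, [x8, #-48] from the test loop: the
  // distance is 16 bytes, equal to the store size, so the writes do not
  // overlap and the scheduler may emit them in ascending address order.
  StoreInfo S0{-64, 1, 16}, S1{-48, 1, 16};
  std::printf("may overlap: %s\n", mayOverlap(S0, S1) ? "yes" : "no");
  return 0;
}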