Index: llvm/lib/Target/AArch64/AArch64.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64.td
+++ llvm/lib/Target/AArch64/AArch64.td
@@ -215,6 +215,10 @@
 def FeatureSlowPaired128 : SubtargetFeature<"slow-paired-128",
     "IsPaired128Slow", "true", "Paired 128 bit loads and stores are slow">;
 
+def FeatureAscendStoreAddress : SubtargetFeature<"ascend-store-address",
+    "IsStoreAddressAscend", "false",
+    "Schedule scalar stores by ascending address">;
+
 def FeatureSlowSTRQro : SubtargetFeature<"slow-strqro-store", "IsSTRQroSlow",
     "true", "STR of Q register with register offset is slow">;
 
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -103,6 +103,15 @@
   /// Returns whether the instruction is a pre-indexed load/store.
   static bool isPreLdSt(const MachineInstr &MI);
 
+  /// Returns whether the instruction is a paired load/store.
+  static bool isPairedLdSt(const MachineInstr &MI);
+
+  /// Returns the base register operand of a load/store.
+  static const MachineOperand &getLdStBaseOp(const MachineInstr &MI);
+
+  /// Returns the immediate offset operand of a load/store.
+  static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI);
+
   /// Returns whether the instruction is FP or NEON.
   static bool isFpOrNEON(const MachineInstr &MI);
 
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -3152,6 +3152,41 @@
   return isPreLd(MI) || isPreSt(MI);
 }
 
+bool AArch64InstrInfo::isPairedLdSt(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+  case AArch64::LDPSi:
+  case AArch64::LDPSWi:
+  case AArch64::LDPDi:
+  case AArch64::LDPQi:
+  case AArch64::LDPWi:
+  case AArch64::LDPXi:
+  case AArch64::STPSi:
+  case AArch64::STPDi:
+  case AArch64::STPQi:
+  case AArch64::STPWi:
+  case AArch64::STPXi:
+  case AArch64::STGPi:
+    return true;
+  }
+}
+
+const MachineOperand &AArch64InstrInfo::getLdStBaseOp(const MachineInstr &MI) {
+  unsigned Idx =
+      AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2
+                                                                            : 1;
+  return MI.getOperand(Idx);
+}
+
+const MachineOperand &
+AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) {
+  unsigned Idx =
+      AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3
+                                                                            : 2;
+  return MI.getOperand(Idx);
+}
+
 static const TargetRegisterClass *getRegClass(const MachineInstr &MI,
                                               Register Reg) {
   if (MI.getParent() == nullptr)
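[Note, illustration only -- not part of the patch] isPairedLdSt, getLdStBaseOp and getLdStOffsetOp are hoisted unchanged from AArch64LoadStoreOptimizer.cpp so other passes can share the operand-index logic: paired and pre-indexed forms keep the base register at operand 2 and the immediate offset at operand 3, while the other handled forms use operands 1 and 2. A hypothetical out-of-pass caller (the helper name below is made up for this sketch, and it assumes the usual AArch64InstrInfo.h include) could look like:

  // Sketch: true for reg+imm loads/stores that address off the stack pointer.
  static bool isSPRelativeLdSt(const MachineInstr &MI) {
    return AArch64InstrInfo::getLdStOffsetOp(MI).isImm() &&
           AArch64InstrInfo::getLdStBaseOp(MI).getReg() == AArch64::SP;
  }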
Index: llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -556,26 +556,6 @@
   }
 }
 
-static bool isPairedLdSt(const MachineInstr &MI) {
-  switch (MI.getOpcode()) {
-  default:
-    return false;
-  case AArch64::LDPSi:
-  case AArch64::LDPSWi:
-  case AArch64::LDPDi:
-  case AArch64::LDPQi:
-  case AArch64::LDPWi:
-  case AArch64::LDPXi:
-  case AArch64::STPSi:
-  case AArch64::STPDi:
-  case AArch64::STPQi:
-  case AArch64::STPWi:
-  case AArch64::STPXi:
-  case AArch64::STGPi:
-    return true;
-  }
-}
-
 static bool isPreLdStPairCandidate(MachineInstr &FirstMI, MachineInstr &MI) {
 
   unsigned OpcA = FirstMI.getOpcode();
@@ -610,7 +590,7 @@
 // Returns the scale and offset range of pre/post indexed variants of MI.
 static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale,
                                        int &MinOffset, int &MaxOffset) {
-  bool IsPaired = isPairedLdSt(MI);
+  bool IsPaired = AArch64InstrInfo::isPairedLdSt(MI);
   bool IsTagStore = isTagStore(MI);
   // ST*G and all paired ldst have the same scale in pre/post-indexed variants
   // as in the "unsigned offset" variant.
@@ -632,17 +612,8 @@
   bool IsPreLdSt = AArch64InstrInfo::isPreLdSt(MI);
   if (IsPreLdSt)
     PairedRegOp += 1;
-  unsigned Idx = isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0;
-  return MI.getOperand(Idx);
-}
-
-static const MachineOperand &getLdStBaseOp(const MachineInstr &MI) {
-  unsigned Idx = isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2 : 1;
-  return MI.getOperand(Idx);
-}
-
-static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI) {
-  unsigned Idx = isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3 : 2;
+  unsigned Idx =
+      AArch64InstrInfo::isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0;
   return MI.getOperand(Idx);
 }
 
@@ -652,12 +623,14 @@
   assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
   int LoadSize = TII->getMemScale(LoadInst);
   int StoreSize = TII->getMemScale(StoreInst);
-  int UnscaledStOffset = TII->hasUnscaledLdStOffset(StoreInst)
-                             ? getLdStOffsetOp(StoreInst).getImm()
-                             : getLdStOffsetOp(StoreInst).getImm() * StoreSize;
-  int UnscaledLdOffset = TII->hasUnscaledLdStOffset(LoadInst)
-                             ? getLdStOffsetOp(LoadInst).getImm()
-                             : getLdStOffsetOp(LoadInst).getImm() * LoadSize;
+  int UnscaledStOffset =
+      TII->hasUnscaledLdStOffset(StoreInst)
+          ? AArch64InstrInfo::getLdStOffsetOp(StoreInst).getImm()
+          : AArch64InstrInfo::getLdStOffsetOp(StoreInst).getImm() * StoreSize;
+  int UnscaledLdOffset =
+      TII->hasUnscaledLdStOffset(LoadInst)
+          ? AArch64InstrInfo::getLdStOffsetOp(LoadInst).getImm()
+          : AArch64InstrInfo::getLdStOffsetOp(LoadInst).getImm() * LoadSize;
   return (UnscaledStOffset <= UnscaledLdOffset) &&
          (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
 }
@@ -736,7 +709,7 @@
   case AArch64::STPWi:
   case AArch64::STPXi:
     // Make sure this is a reg+imm (as opposed to an address reloc).
-    if (!getLdStOffsetOp(MI).isImm())
+    if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
       return false;
 
     return true;
@@ -770,17 +743,18 @@
   // Also based on MergeForward is from where we copy the base register operand
   // so we get the flags compatible with the input code.
   const MachineOperand &BaseRegOp =
-      MergeForward ? getLdStBaseOp(*MergeMI) : getLdStBaseOp(*I);
+      MergeForward ? AArch64InstrInfo::getLdStBaseOp(*MergeMI)
+                   : AArch64InstrInfo::getLdStBaseOp(*I);
 
   // Which register is Rt and which is Rt2 depends on the offset order.
   MachineInstr *RtMI;
-  if (getLdStOffsetOp(*I).getImm() ==
-      getLdStOffsetOp(*MergeMI).getImm() + OffsetStride)
+  if (AArch64InstrInfo::getLdStOffsetOp(*I).getImm() ==
+      AArch64InstrInfo::getLdStOffsetOp(*MergeMI).getImm() + OffsetStride)
     RtMI = &*MergeMI;
   else
     RtMI = &*I;
 
-  int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
+  int OffsetImm = AArch64InstrInfo::getLdStOffsetOp(*RtMI).getImm();
   // Change the scaled offset from small to large type.
   if (IsScaled) {
     assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
@@ -944,10 +918,11 @@
   // Also based on MergeForward is from where we copy the base register operand
   // so we get the flags compatible with the input code.
   const MachineOperand &BaseRegOp =
-      MergeForward ? getLdStBaseOp(*Paired) : getLdStBaseOp(*I);
+      MergeForward ? AArch64InstrInfo::getLdStBaseOp(*Paired)
+                   : AArch64InstrInfo::getLdStBaseOp(*I);
 
-  int Offset = getLdStOffsetOp(*I).getImm();
-  int PairedOffset = getLdStOffsetOp(*Paired).getImm();
+  int Offset = AArch64InstrInfo::getLdStOffsetOp(*I).getImm();
+  int PairedOffset = AArch64InstrInfo::getLdStOffsetOp(*Paired).getImm();
   bool PairedIsUnscaled = TII->hasUnscaledLdStOffset(Paired->getOpcode());
   if (IsUnscaled != PairedIsUnscaled) {
     // We're trying to pair instructions that differ in how they are scaled. If
@@ -982,7 +957,7 @@
     RtMI = &*I;
     Rt2MI = &*Paired;
   }
-  int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
+  int OffsetImm = AArch64InstrInfo::getLdStOffsetOp(*RtMI).getImm();
   // Scale the immediate offset, if necessary.
   if (TII->hasUnscaledLdStOffset(RtMI->getOpcode())) {
     assert(!(OffsetImm % TII->getMemScale(*RtMI)) &&
@@ -1140,12 +1115,14 @@
   assert(IsUnscaled == TII->hasUnscaledLdStOffset(*StoreI) &&
          "Unsupported ld/st match");
   assert(LoadSize <= StoreSize && "Invalid load size");
-  int UnscaledLdOffset = IsUnscaled
-                             ? getLdStOffsetOp(*LoadI).getImm()
-                             : getLdStOffsetOp(*LoadI).getImm() * LoadSize;
-  int UnscaledStOffset = IsUnscaled
-                             ? getLdStOffsetOp(*StoreI).getImm()
-                             : getLdStOffsetOp(*StoreI).getImm() * StoreSize;
+  int UnscaledLdOffset =
+      IsUnscaled
+          ? AArch64InstrInfo::getLdStOffsetOp(*LoadI).getImm()
+          : AArch64InstrInfo::getLdStOffsetOp(*LoadI).getImm() * LoadSize;
+  int UnscaledStOffset =
+      IsUnscaled
+          ? AArch64InstrInfo::getLdStOffsetOp(*StoreI).getImm()
+          : AArch64InstrInfo::getLdStOffsetOp(*StoreI).getImm() * StoreSize;
   int Width = LoadSize * 8;
   Register DestReg =
       IsStoreXReg ? Register(TRI->getMatchingSuperReg(
@@ -1243,7 +1220,7 @@
   MachineBasicBlock::iterator B = I->getParent()->begin();
   MachineBasicBlock::iterator MBBI = I;
   MachineInstr &LoadMI = *I;
-  Register BaseReg = getLdStBaseOp(LoadMI).getReg();
+  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(LoadMI).getReg();
 
   // If the load is the first instruction in the block, there's obviously
   // not any matching store.
@@ -1272,7 +1249,8 @@
     // Also we can't handle stores without an immediate offset operand,
     // while the operand might be the address for a global variable.
     if (MI.mayStore() && isMatchingStore(LoadMI, MI) &&
-        BaseReg == getLdStBaseOp(MI).getReg() && getLdStOffsetOp(MI).isImm() &&
+        BaseReg == AArch64InstrInfo::getLdStBaseOp(MI).getReg() &&
+        AArch64InstrInfo::getLdStOffsetOp(MI).isImm() &&
        isLdOffsetInRangeOfSt(LoadMI, MI, TII) &&
        ModifiedRegUnits.available(getLdStRegOp(MI).getReg())) {
      StoreI = MBBI;
@@ -1539,8 +1517,8 @@
   bool MayLoad = FirstMI.mayLoad();
   bool IsUnscaled = TII->hasUnscaledLdStOffset(FirstMI);
   Register Reg = getLdStRegOp(FirstMI).getReg();
-  Register BaseReg = getLdStBaseOp(FirstMI).getReg();
-  int Offset = getLdStOffsetOp(FirstMI).getImm();
+  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(FirstMI).getReg();
+  int Offset = AArch64InstrInfo::getLdStOffsetOp(FirstMI).getImm();
   int OffsetStride = IsUnscaled ? TII->getMemScale(FirstMI) : 1;
   bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);
 
@@ -1575,7 +1553,7 @@
     Flags.setSExtIdx(-1);
     if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
-        getLdStOffsetOp(MI).isImm()) {
+        AArch64InstrInfo::getLdStOffsetOp(MI).isImm()) {
       assert(MI.mayLoadOrStore() && "Expected memory operation.");
 
       // If we've found another instruction with the same opcode, check to see
       // if the base and offset are compatible with our starting instruction.
@@ -1583,8 +1561,8 @@
       // check for +1/-1. Make sure to check the new instruction offset is
      // actually an immediate and not a symbolic reference destined for
      // a relocation.
-      Register MIBaseReg = getLdStBaseOp(MI).getReg();
-      int MIOffset = getLdStOffsetOp(MI).getImm();
+      Register MIBaseReg = AArch64InstrInfo::getLdStBaseOp(MI).getReg();
+      int MIOffset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
       bool MIIsUnscaled = TII->hasUnscaledLdStOffset(MI);
       if (IsUnscaled != MIIsUnscaled) {
         // We're trying to pair instructions that differ in how they are scaled.
@@ -1615,15 +1593,16 @@
       // can't be paired: bail and keep looking.
       if (IsPreLdSt) {
         bool IsOutOfBounds = MIOffset != TII->getMemScale(MI);
-        bool IsBaseRegUsed =
-            !UsedRegUnits.available(getLdStBaseOp(MI).getReg());
-        bool IsBaseRegModified =
-            !ModifiedRegUnits.available(getLdStBaseOp(MI).getReg());
+        bool IsBaseRegUsed = !UsedRegUnits.available(
+            AArch64InstrInfo::getLdStBaseOp(MI).getReg());
+        bool IsBaseRegModified = !ModifiedRegUnits.available(
+            AArch64InstrInfo::getLdStBaseOp(MI).getReg());
         // If the stored value and the address of the second instruction is
         // the same, it needs to be using the updated register and therefore
         // it must not be folded.
-        bool IsMIRegTheSame = TRI->regsOverlap(getLdStRegOp(MI).getReg(),
-                                               getLdStBaseOp(MI).getReg());
+        bool IsMIRegTheSame =
+            TRI->regsOverlap(getLdStRegOp(MI).getReg(),
+                             AArch64InstrInfo::getLdStBaseOp(MI).getReg());
         if (IsOutOfBounds || IsBaseRegUsed || IsBaseRegModified ||
             IsMIRegTheSame) {
           LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
@@ -1776,7 +1755,7 @@
       MaybeCFI->getOpcode() != TargetOpcode::CFI_INSTRUCTION ||
       !(MI.getFlag(MachineInstr::FrameSetup) ||
         MI.getFlag(MachineInstr::FrameDestroy)) ||
-      getLdStBaseOp(MI).getReg() != AArch64::SP)
+      AArch64InstrInfo::getLdStBaseOp(MI).getReg() != AArch64::SP)
     return End;
 
   const MachineFunction &MF = *MI.getParent()->getParent();
@@ -1823,12 +1802,12 @@
   MachineInstrBuilder MIB;
   int Scale, MinOffset, MaxOffset;
   getPrePostIndexedMemOpInfo(*I, Scale, MinOffset, MaxOffset);
-  if (!isPairedLdSt(*I)) {
+  if (!AArch64InstrInfo::isPairedLdSt(*I)) {
     // Non-paired instruction.
     MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
              .add(getLdStRegOp(*Update))
              .add(getLdStRegOp(*I))
-             .add(getLdStBaseOp(*I))
+             .add(AArch64InstrInfo::getLdStBaseOp(*I))
             .addImm(Value / Scale)
             .setMemRefs(I->memoperands())
            .setMIFlags(I->mergeFlagsWith(*Update));
@@ -1838,7 +1817,7 @@
              .add(getLdStRegOp(*Update))
              .add(getLdStRegOp(*I, 0))
              .add(getLdStRegOp(*I, 1))
-             .add(getLdStBaseOp(*I))
+             .add(AArch64InstrInfo::getLdStBaseOp(*I))
             .addImm(Value / Scale)
            .setMemRefs(I->memoperands())
            .setMIFlags(I->mergeFlagsWith(*Update));
@@ -1928,8 +1907,9 @@
   MachineInstr &MemMI = *I;
   MachineBasicBlock::iterator MBBI = I;
 
-  Register BaseReg = getLdStBaseOp(MemMI).getReg();
-  int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * TII->getMemScale(MemMI);
+  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
+  int MIUnscaledOffset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm() *
+                         TII->getMemScale(MemMI);
 
   // Scan forward looking for post-index opportunities. Updating instructions
   // can't be formed if the memory instruction doesn't have the offset we're
@@ -1944,7 +1924,7 @@
   // behavior in this case unlike normal stores, and always performs writeback
   // after reading the source register value.
   if (!isTagStore(MemMI) && MemMI.getOpcode() != AArch64::STGPi) {
-    bool IsPairedInsn = isPairedLdSt(MemMI);
+    bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
     for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
       Register DestReg = getLdStRegOp(MemMI, i).getReg();
       if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
@@ -2005,8 +1985,8 @@
   MachineBasicBlock::iterator MBBI = I;
   MachineFunction &MF = *MemMI.getMF();
 
-  Register BaseReg = getLdStBaseOp(MemMI).getReg();
-  int Offset = getLdStOffsetOp(MemMI).getImm();
+  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
+  int Offset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm();
 
   // If the load/store is the first instruction in the block, there's obviously
   // not any matching update. Ditto if the memory offset isn't zero.
@@ -2015,7 +1995,7 @@
   // If the base register overlaps a destination register, we can't
   // merge the update.
   if (!isTagStore(MemMI)) {
-    bool IsPairedInsn = isPairedLdSt(MemMI);
+    bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
     for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
       Register DestReg = getLdStRegOp(MemMI, i).getReg();
       if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
@@ -2085,7 +2065,7 @@
 
   // Make sure this is a reg+imm.
   // FIXME: It is possible to extend it to handle reg+reg cases.
-  if (!getLdStOffsetOp(MI).isImm())
+  if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
     return false;
 
   // Look backward up to LdStLimit instructions.
@@ -2139,7 +2119,7 @@
   // range, plus allow an extra one in case we find a later insn that matches
   // with Offset-1)
   bool IsUnscaled = TII->hasUnscaledLdStOffset(MI);
-  int Offset = getLdStOffsetOp(MI).getImm();
+  int Offset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
   int OffsetStride = IsUnscaled ? TII->getMemScale(MI) : 1;
   // Allow one more for offset.
   if (Offset > 0)
@@ -2206,7 +2186,8 @@
   // The immediate in the load/store is scaled by the size of the memory
   // operation. The immediate in the add we're looking for,
   // however, is not, so adjust here.
-  int UnscaledOffset = getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI);
+  int UnscaledOffset =
+      AArch64InstrInfo::getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI);
 
   // Look forward to try to find a pre-index instruction. For example,
   // ldr x1, [x0, #64]
Index: llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp
+++ llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp
@@ -7,10 +7,57 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64MachineScheduler.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
 #include "MCTargetDesc/AArch64MCTargetDesc.h"
 
 using namespace llvm;
 
+static bool needReorderStoreMI(const MachineInstr *MI) {
+  if (!MI)
+    return false;
+
+  switch (MI->getOpcode()) {
+  default:
+    return false;
+  case AArch64::STURQi:
+  case AArch64::STRQui:
+    if (MI->getMF()->getSubtarget<AArch64Subtarget>().isStoreAddressAscend())
+      return false;
+    LLVM_FALLTHROUGH;
+  case AArch64::STPQi:
+    return AArch64InstrInfo::getLdStOffsetOp(*MI).getType() == MachineOperand::MO_Immediate;
+  }
+
+  return false;
+}
+
+// Return true if two stores with the same base address may overlap writes.
+static bool mayOverlapWrite(const MachineInstr &MI0, const MachineInstr &MI1,
+                            int64_t &Off0, int64_t &Off1) {
+  const MachineOperand &Base0 = AArch64InstrInfo::getLdStBaseOp(MI0);
+  const MachineOperand &Base1 = AArch64InstrInfo::getLdStBaseOp(MI1);
+
+  // Assume overlapping writes if the two stores do not share the same base.
+  if (!Base0.isIdenticalTo(Base1))
+    return true;
+
+  int StoreSize0 = AArch64InstrInfo::getMemScale(MI0);
+  int StoreSize1 = AArch64InstrInfo::getMemScale(MI1);
+  Off0 = AArch64InstrInfo::hasUnscaledLdStOffset(MI0.getOpcode())
+             ? AArch64InstrInfo::getLdStOffsetOp(MI0).getImm()
+             : AArch64InstrInfo::getLdStOffsetOp(MI0).getImm() * StoreSize0;
+  Off1 = AArch64InstrInfo::hasUnscaledLdStOffset(MI1.getOpcode())
+             ? AArch64InstrInfo::getLdStOffsetOp(MI1).getImm()
+             : AArch64InstrInfo::getLdStOffsetOp(MI1).getImm() * StoreSize1;
+
+  const MachineInstr &MI = (Off0 < Off1) ? MI0 : MI1;
+  int Multiples = AArch64InstrInfo::isPairedLdSt(MI) ? 2 : 1;
+  int StoreSize = AArch64InstrInfo::getMemScale(MI) * Multiples;
+
+  return llabs(Off0 - Off1) < StoreSize;
+}
+
 bool AArch64PostRASchedStrategy::tryCandidate(SchedCandidate &Cand,
                                               SchedCandidate &TryCand) {
   bool OriginalResult = PostGenericScheduler::tryCandidate(Cand, TryCand);
@@ -18,20 +65,16 @@
   if (Cand.isValid()) {
     MachineInstr *Instr0 = TryCand.SU->getInstr();
     MachineInstr *Instr1 = Cand.SU->getInstr();
-    // When dealing with two STPqi's.
-    if (Instr0 && Instr1 && Instr0->getOpcode() == Instr1->getOpcode () &&
-        Instr0->getOpcode() == AArch64::STPQi)
-    {
-      MachineOperand &Base0 = Instr0->getOperand(2);
-      MachineOperand &Base1 = Instr1->getOperand(2);
-      int64_t Off0 = Instr0->getOperand(3).getImm();
-      int64_t Off1 = Instr1->getOperand(3).getImm();
-      // With the same base address and non-overlapping writes.
-      if (Base0.isIdenticalTo(Base1) && llabs (Off0 - Off1) >= 2) {
-        TryCand.Reason = NodeOrder;
-        // Order them by ascending offsets.
-        return Off0 < Off1;
-      }
+
+    if (!needReorderStoreMI(Instr0) || !needReorderStoreMI(Instr1))
+      return OriginalResult;
+
+    int64_t Off0, Off1;
+    // With the same base address and non-overlapping writes.
+    if (!mayOverlapWrite(*Instr0, *Instr1, Off0, Off1)) {
+      TryCand.Reason = NodeOrder;
+      // Order them by ascending offsets.
+      return Off0 < Off1;
     }
   }
 
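[Note, illustration only -- not part of the patch] The new overlap test generalizes the old hard-coded check on two STPQi's, which treated scaled offsets at least 2 apart as non-overlapping. mayOverlapWrite converts both offsets to bytes and compares the distance with the number of bytes written by the lower store. For two STPQi's off the same base the arithmetic reduces to the following sketch (values taken from the code above: getMemScale is 16 for a Q store and a pair writes two registers; the function name is made up):

  #include <cstdlib>

  // Sketch: byte-level overlap check for two STPQi stores off the same base.
  static bool stpqWritesOverlap(long long ScaledOff0, long long ScaledOff1) {
    long long Off0 = ScaledOff0 * 16; // STPQi immediates are scaled by 16.
    long long Off1 = ScaledOff1 * 16;
    long long PairBytes = 2 * 16;     // A Q-register pair writes 32 bytes.
    return std::llabs(Off0 - Off1) < PairBytes;
  }

For scaled offsets 0 and 2 this yields byte offsets 0 and 32 against a write size of 32, so the stores do not overlap and tryCandidate may order them by ascending offset, matching the old >= 2 condition.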
Index: llvm/test/CodeGen/AArch64/aarch64-sched-store.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/aarch64-sched-store.ll
@@ -0,0 +1,89 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple aarch64 -mcpu=cortex-a57 -mattr=+slow-paired-128 < %s | FileCheck %s --check-prefixes=DEFAULT
+; RUN: llc -mtriple aarch64 -mcpu=cortex-a57 -mattr=+slow-paired-128 -mattr=+ascend-store-address < %s | FileCheck %s --check-prefixes=ASCEND
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define dso_local void @memset_unroll2(double* nocapture %array, i64 %size) {
+; DEFAULT-LABEL: memset_unroll2:
+; DEFAULT:       // %bb.0: // %entry
+; DEFAULT-NEXT:    fmov v0.2d, #2.00000000
+; DEFAULT-NEXT:    add x8, x0, #64
+; DEFAULT-NEXT:    .p2align 4, 0x0, 8
+; DEFAULT-NEXT:  .LBB0_1: // %vector.body
+; DEFAULT-NEXT:    // =>This Inner Loop Header: Depth=1
+; DEFAULT-NEXT:    stur q0, [x8, #-64]
+; DEFAULT-NEXT:    subs x1, x1, #4
+; DEFAULT-NEXT:    stur q0, [x8, #-48]
+; DEFAULT-NEXT:    str q0, [x8]
+; DEFAULT-NEXT:    str q0, [x8, #16]
+; DEFAULT-NEXT:    str q0, [x8, #32]
+; DEFAULT-NEXT:    str q0, [x8, #48]
+; DEFAULT-NEXT:    stur q0, [x8, #-32]
+; DEFAULT-NEXT:    stur q0, [x8, #-16]
+; DEFAULT-NEXT:    add x8, x8, #128
+; DEFAULT-NEXT:    b.ne .LBB0_1
+; DEFAULT-NEXT:  // %bb.2: // %cleanup
+; DEFAULT-NEXT:    ret
+;
+; ASCEND-LABEL: memset_unroll2:
+; ASCEND:       // %bb.0: // %entry
+; ASCEND-NEXT:    fmov v0.2d, #2.00000000
+; ASCEND-NEXT:    add x8, x0, #64
+; ASCEND-NEXT:    .p2align 4, 0x0, 8
+; ASCEND-NEXT:  .LBB0_1: // %vector.body
+; ASCEND-NEXT:    // =>This Inner Loop Header: Depth=1
+; ASCEND-NEXT:    stur q0, [x8, #-64]
+; ASCEND-NEXT:    subs x1, x1, #4
+; ASCEND-NEXT:    stur q0, [x8, #-48]
+; ASCEND-NEXT:    stur q0, [x8, #-32]
+; ASCEND-NEXT:    stur q0, [x8, #-16]
+; ASCEND-NEXT:    str q0, [x8]
+; ASCEND-NEXT:    str q0, [x8, #16]
+; ASCEND-NEXT:    str q0, [x8, #32]
+; ASCEND-NEXT:    str q0, [x8, #48]
+; ASCEND-NEXT:    add x8, x8, #128
+; ASCEND-NEXT:    b.ne .LBB0_1
+; ASCEND-NEXT:  // %bb.2: // %cleanup
+; ASCEND-NEXT:    ret
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index16, %vector.body ]
+  %niter = phi i64 [ %size, %entry ], [ %niter.nsub.3, %vector.body ]
+  %array0 = getelementptr inbounds double, double* %array, i64 %index
+  %array0.cast = bitcast double* %array0 to <2 x double>*
+  store <2 x double> <double 2.000000e+00, double 2.000000e+00>, <2 x double>* %array0.cast, align 8
+  %array2 = getelementptr inbounds double, double* %array0, i64 2
+  %array2.cast = bitcast double* %array2 to <2 x double>*
+  store <2 x double> <double 2.000000e+00, double 2.000000e+00>, <2 x double>* %array2.cast, align 8
+  %index4 = or i64 %index, 4
+  %array4 = getelementptr inbounds double, double* %array, i64 %index4
+  %array4.cast = bitcast double* %array4 to <2 x double>*
+  store <2 x double> <double 2.000000e+00, double 2.000000e+00>, <2 x double>* %array4.cast, align 8
+  %array6 = getelementptr inbounds double, double* %array4, i64 2
+  %array6.cast = bitcast double* %array6 to <2 x double>*
+  store <2 x double> <double 2.000000e+00, double 2.000000e+00>, <2 x double>* %array6.cast, align 8
+  %index8 = or i64 %index, 8
+  %array8 = getelementptr inbounds double, double* %array, i64 %index8
+  %array8.cast = bitcast double* %array8 to <2 x double>*
+  store <2 x double> <double 2.000000e+00, double 2.000000e+00>, <2 x double>* %array8.cast, align 8
+  %array10 = getelementptr inbounds double, double* %array8, i64 2
+  %array10.cast = bitcast double* %array10 to <2 x double>*
+  store <2 x double> <double 2.000000e+00, double 2.000000e+00>, <2 x double>* %array10.cast, align 8
+  %index12 = or i64 %index, 12
+  %array12 = getelementptr inbounds double, double* %array, i64 %index12
+  %array12.cast = bitcast double* %array12 to <2 x double>*
+  store <2 x double> <double 2.000000e+00, double 2.000000e+00>, <2 x double>* %array12.cast, align 8
+  %array14 = getelementptr inbounds double, double* %array12, i64 2
+  %array14.cast = bitcast double* %array14 to <2 x double>*
+  store <2 x double> <double 2.000000e+00, double 2.000000e+00>, <2 x double>* %array14.cast, align 8
+  %index16 = add i64 %index, 16
+  %niter.nsub.3 = add i64 %niter, -4
+  %niter.ncmp.3 = icmp eq i64 %niter.nsub.3, 0
+  br i1 %niter.ncmp.3, label %cleanup, label %vector.body
+
+cleanup:                                          ; preds = %vector.body
+  ret void
+}
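[Note -- not part of the patch] The two RUN lines differ only in -mattr=+ascend-store-address. With the feature enabled, the ASCEND block checks that the four negative-offset stur stores are issued before the four positive-offset str stores, so the eight 16-byte stores leave the loop body in strictly ascending address order, whereas the DEFAULT schedule interleaves them. As the NOTE line records, the CHECK blocks were generated with utils/update_llc_test_checks.py and should be regenerated with that script if the expected schedule changes.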