diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -660,6 +660,7 @@ static inline bool isLoopStart(MachineInstr &MI) { return MI.getOpcode() == ARM::t2DoLoopStart || + MI.getOpcode() == ARM::t2DoLoopStartTP || MI.getOpcode() == ARM::t2WhileLoopStart; } diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -5949,8 +5949,8 @@ // Be conservative with ARMv8.1 MVE instructions. if (Opc == ARM::t2BF_LabelPseudo || Opc == ARM::t2DoLoopStart || - Opc == ARM::t2WhileLoopStart || Opc == ARM::t2LoopDec || - Opc == ARM::t2LoopEnd) + Opc == ARM::t2DoLoopStartTP || Opc == ARM::t2WhileLoopStart || + Opc == ARM::t2LoopDec || Opc == ARM::t2LoopEnd) return outliner::InstrType::Illegal; const MCInstrDesc &MCID = MI.getDesc(); diff --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td --- a/llvm/lib/Target/ARM/ARMInstrThumb2.td +++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td @@ -5427,6 +5427,9 @@ t2PseudoInst<(outs GPRlr:$X), (ins rGPR:$elts), 4, IIC_Br, [(set GPRlr:$X, (int_start_loop_iterations rGPR:$elts))]>; +def t2DoLoopStartTP : + t2PseudoInst<(outs GPRlr:$X), (ins rGPR:$elts, rGPR:$count), 4, IIC_Br, []>; + let hasSideEffects = 0 in def t2LoopDec : t2PseudoInst<(outs GPRlr:$Rm), (ins GPRlr:$Rn, imm0_7:$size), diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -101,6 +101,10 @@ hasVPRUse(&MI); } +static bool isDo(MachineInstr *MI) { + return MI->getOpcode() != ARM::t2WhileLoopStart; +} + namespace { using InstSet = SmallPtrSetImpl; @@ -431,12 +435,11 @@ MachineOperand &getLoopStartOperand() { if (IsTailPredicationLegal()) return 
TPNumElements; - return Start->getOpcode() == ARM::t2DoLoopStart ? Start->getOperand(1) - : Start->getOperand(0); + return isDo(Start) ? Start->getOperand(1) : Start->getOperand(0); } unsigned getStartOpcode() const { - bool IsDo = Start->getOpcode() == ARM::t2DoLoopStart; + bool IsDo = isDo(Start); if (!IsTailPredicationLegal()) return IsDo ? ARM::t2DLS : ARM::t2WLS; @@ -622,12 +625,10 @@ // count instead of iteration count, won't affect any other instructions // than the LoopStart and LoopDec. // TODO: We should try to insert the [W|D]LSTP after any of the other uses. - Register StartReg = Start->getOpcode() == ARM::t2DoLoopStart - ? Start->getOperand(1).getReg() - : Start->getOperand(0).getReg(); + Register StartReg = isDo(Start) ? Start->getOperand(1).getReg() + : Start->getOperand(0).getReg(); if (StartInsertPt == Start && StartReg == ARM::LR) { - if (auto *IterCount = RDA.getMIOperand( - Start, Start->getOpcode() == ARM::t2DoLoopStart ? 1 : 0)) { + if (auto *IterCount = RDA.getMIOperand(Start, isDo(Start) ? 1 : 0)) { SmallPtrSet Uses; RDA.getGlobalUses(IterCount, MCRegister::from(ARM::LR), Uses); for (auto *Use : Uses) { @@ -644,53 +645,88 @@ // elements is provided to the vctp instruction, so we need to check that // we can use this register at InsertPt. MachineInstr *VCTP = VCTPs.back(); - TPNumElements = VCTP->getOperand(1); - MCRegister NumElements = TPNumElements.getReg().asMCReg(); - - // If the register is defined within loop, then we can't perform TP. - // TODO: Check whether this is just a mov of a register that would be - // available. 
- if (RDA.hasLocalDefBefore(VCTP, NumElements)) { - LLVM_DEBUG(dbgs() << "ARM Loops: VCTP operand is defined in the loop.\n"); - return false; - } + if (Start->getOpcode() == ARM::t2DoLoopStartTP) { + TPNumElements = Start->getOperand(2); + StartInsertPt = Start; + StartInsertBB = Start->getParent(); + } else { + TPNumElements = VCTP->getOperand(1); + MCRegister NumElements = TPNumElements.getReg().asMCReg(); + + // If the register is defined within loop, then we can't perform TP. + // TODO: Check whether this is just a mov of a register that would be + // available. + if (RDA.hasLocalDefBefore(VCTP, NumElements)) { + LLVM_DEBUG(dbgs() << "ARM Loops: VCTP operand is defined in the loop.\n"); + return false; + } - // The element count register maybe defined after InsertPt, in which case we - // need to try to move either InsertPt or the def so that the [w|d]lstp can - // use the value. - - if (StartInsertPt != StartInsertBB->end() && - !RDA.isReachingDefLiveOut(&*StartInsertPt, NumElements)) { - if (auto *ElemDef = RDA.getLocalLiveOutMIDef(StartInsertBB, NumElements)) { - if (RDA.isSafeToMoveForwards(ElemDef, &*StartInsertPt)) { - ElemDef->removeFromParent(); - StartInsertBB->insert(StartInsertPt, ElemDef); - LLVM_DEBUG(dbgs() << "ARM Loops: Moved element count def: " - << *ElemDef); - } else if (RDA.isSafeToMoveBackwards(&*StartInsertPt, ElemDef)) { - StartInsertPt->removeFromParent(); - StartInsertBB->insertAfter(MachineBasicBlock::iterator(ElemDef), - &*StartInsertPt); - LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef); - } else { - // If we fail to move an instruction and the element count is provided - // by a mov, use the mov operand if it will have the same value at the - // insertion point - MachineOperand Operand = ElemDef->getOperand(1); - if (isMovRegOpcode(ElemDef->getOpcode()) && - RDA.getUniqueReachingMIDef(ElemDef, Operand.getReg().asMCReg()) == - RDA.getUniqueReachingMIDef(&*StartInsertPt, - Operand.getReg().asMCReg())) { - 
TPNumElements = Operand; - NumElements = TPNumElements.getReg(); - } else { + // The element count register maybe defined after InsertPt, in which case we + // need to try to move either InsertPt or the def so that the [w|d]lstp can + // use the value. + + if (StartInsertPt != StartInsertBB->end() && + !RDA.isReachingDefLiveOut(&*StartInsertPt, NumElements)) { + if (auto *ElemDef = + RDA.getLocalLiveOutMIDef(StartInsertBB, NumElements)) { + if (RDA.isSafeToMoveForwards(ElemDef, &*StartInsertPt)) { + ElemDef->removeFromParent(); + StartInsertBB->insert(StartInsertPt, ElemDef); LLVM_DEBUG(dbgs() - << "ARM Loops: Unable to move element count to loop " - << "start instruction.\n"); - return false; + << "ARM Loops: Moved element count def: " << *ElemDef); + } else if (RDA.isSafeToMoveBackwards(&*StartInsertPt, ElemDef)) { + StartInsertPt->removeFromParent(); + StartInsertBB->insertAfter(MachineBasicBlock::iterator(ElemDef), + &*StartInsertPt); + LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef); + } else { + // If we fail to move an instruction and the element count is provided + // by a mov, use the mov operand if it will have the same value at the + // insertion point + MachineOperand Operand = ElemDef->getOperand(1); + if (isMovRegOpcode(ElemDef->getOpcode()) && + RDA.getUniqueReachingMIDef(ElemDef, Operand.getReg().asMCReg()) == + RDA.getUniqueReachingMIDef(&*StartInsertPt, + Operand.getReg().asMCReg())) { + TPNumElements = Operand; + NumElements = TPNumElements.getReg(); + } else { + LLVM_DEBUG(dbgs() + << "ARM Loops: Unable to move element count to loop " + << "start instruction.\n"); + return false; + } } } } + + // Especially in the case of while loops, InsertBB may not be the + // preheader, so we need to check that the register isn't redefined + // before entering the loop. + auto CannotProvideElements = [this](MachineBasicBlock *MBB, + MCRegister NumElements) { + if (MBB->empty()) + return false; + // NumElements is redefined in this block. 
+ if (RDA.hasLocalDefBefore(&MBB->back(), NumElements)) + return true; + + // Don't continue searching up through multiple predecessors. + if (MBB->pred_size() > 1) + return true; + + return false; + }; + + // Search backwards for a def, until we get to InsertBB. + MachineBasicBlock *MBB = Preheader; + while (MBB && MBB != StartInsertBB) { + if (CannotProvideElements(MBB, NumElements)) { + LLVM_DEBUG(dbgs() << "ARM Loops: Unable to provide element count.\n"); + return false; + } + MBB = *MBB->pred_begin(); + } } // Could inserting the [W|D]LSTP cause some unintended affects? In a perfect @@ -717,34 +753,6 @@ if (CannotInsertWDLSTPBetween(StartInsertPt, StartInsertBB->end())) return false; - // Especially in the case of while loops, InsertBB may not be the - // preheader, so we need to check that the register isn't redefined - // before entering the loop. - auto CannotProvideElements = [this](MachineBasicBlock *MBB, - MCRegister NumElements) { - if (MBB->empty()) - return false; - // NumElements is redefined in this block. - if (RDA.hasLocalDefBefore(&MBB->back(), NumElements)) - return true; - - // Don't continue searching up through multiple predecessors. - if (MBB->pred_size() > 1) - return true; - - return false; - }; - - // Search backwards for a def, until we get to InsertBB. - MachineBasicBlock *MBB = Preheader; - while (MBB && MBB != StartInsertBB) { - if (CannotProvideElements(MBB, NumElements)) { - LLVM_DEBUG(dbgs() << "ARM Loops: Unable to provide element count.\n"); - return false; - } - MBB = *MBB->pred_begin(); - } - // Check that the value change of the element count is what we expect and // that the predication will be equivalent. For this we need: // NumElements = NumElements - VectorWidth. 
The sub will be a sub immediate @@ -753,7 +761,7 @@ return -getAddSubImmediate(*MI) == ExpectedVecWidth; }; - MBB = VCTP->getParent(); + MachineBasicBlock *MBB = VCTP->getParent(); // Remove modifications to the element count since they have no purpose in a // tail predicated loop. Explicitly refer to the vctp operand no matter which // register NumElements has been assigned to, since that is what the @@ -1062,8 +1070,7 @@ InstSet &ToRemove) { // For a t2DoLoopStart it is always valid to use the start insertion point. // For WLS we can define LR if LR already contains the same value. - if (Start->getOpcode() == ARM::t2DoLoopStart || - Start->getOperand(0).getReg() == ARM::LR) { + if (isDo(Start) || Start->getOperand(0).getReg() == ARM::LR) { InsertPt = MachineBasicBlock::iterator(Start); InsertBB = Start->getParent(); return true; @@ -1434,8 +1441,8 @@ LLVM_DEBUG(dbgs() << "ARM Loops: Trying DCE on loop iteration count.\n"); - MachineInstr *Def = RDA->getMIOperand( - LoLoop.Start, LoLoop.Start->getOpcode() == ARM::t2DoLoopStart ? 1 : 0); + MachineInstr *Def = + RDA->getMIOperand(LoLoop.Start, isDo(LoLoop.Start) ? 
1 : 0); if (!Def) { LLVM_DEBUG(dbgs() << "ARM Loops: Couldn't find iteration count.\n"); return; @@ -1457,7 +1464,6 @@ MachineBasicBlock::iterator InsertPt = LoLoop.StartInsertPt; MachineInstr *Start = LoLoop.Start; MachineBasicBlock *MBB = LoLoop.StartInsertBB; - bool IsDo = Start->getOpcode() == ARM::t2DoLoopStart; unsigned Opc = LoLoop.getStartOpcode(); MachineOperand &Count = LoLoop.getLoopStartOperand(); @@ -1466,7 +1472,7 @@ MIB.addDef(ARM::LR); MIB.add(Count); - if (!IsDo) + if (!isDo(Start)) MIB.add(Start->getOperand(1)); LoLoop.ToRemove.insert(Start); diff --git a/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp b/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp --- a/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp +++ b/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp @@ -6,17 +6,13 @@ // //===----------------------------------------------------------------------===// // -/// \file This pass does a few optimisations related to MVE VPT blocks before -/// register allocation is performed. The goal is to maximize the sizes of the -/// blocks that will be created by the MVE VPT Block Insertion pass (which runs -/// after register allocation). The first optimisation done by this pass is the -/// replacement of "opposite" VCMPs with VPNOTs, so the Block Insertion pass -/// can delete them later to create larger VPT blocks. -/// The second optimisation replaces re-uses of old VCCR values with VPNOTs when -/// inside a block of predicated instructions. This is done to avoid -/// spill/reloads of VPR in the middle of a block, which prevents the Block -/// Insertion pass from creating large blocks. -// +/// \file This pass does a few optimisations related to Tail predicated loops +/// and MVE VPT blocks before register allocation is performed. For VPT blocks +/// the goal is to maximize the sizes of the blocks that will be created by the +/// MVE VPT Block Insertion pass (which runs after register allocation). 
For +/// tail predicated loops we transform the loop into something that will +/// hopefully make the backend ARMLowOverheadLoops pass's job easier. +/// //===----------------------------------------------------------------------===// #include "ARM.h" @@ -25,9 +21,12 @@ #include "Thumb2InstrInfo.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include @@ -46,11 +45,20 @@ bool runOnMachineFunction(MachineFunction &Fn) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); + MachineFunctionPass::getAnalysisUsage(AU); + } + StringRef getPassName() const override { - return "ARM MVE VPT Optimisation Pass"; + return "ARM MVE TailPred and VPT Optimisation Pass"; } private: + bool ConvertTailPredLoop(MachineLoop *ML, MachineDominatorTree *DT); MachineInstr &ReplaceRegisterUseWithVPNOT(MachineBasicBlock &MBB, MachineInstr &Instr, MachineOperand &User, @@ -64,8 +72,177 @@ } // end anonymous namespace -INITIALIZE_PASS(MVEVPTOptimisations, DEBUG_TYPE, - "ARM MVE VPT Optimisations pass", false, false) +INITIALIZE_PASS_BEGIN(MVEVPTOptimisations, DEBUG_TYPE, + "ARM MVE TailPred and VPT Optimisations pass", false, + false) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_END(MVEVPTOptimisations, DEBUG_TYPE, + "ARM MVE TailPred and VPT Optimisations pass", false, false) + +static MachineInstr *LookThroughCOPY(MachineInstr *MI, + MachineRegisterInfo *MRI) { + while (MI && MI->getOpcode() == TargetOpcode::COPY && + MI->getOperand(1).getReg().isVirtual()) + MI = MRI->getVRegDef(MI->getOperand(1).getReg()); + return 
MI; +} + +// Given a loop ML, this attempts to find the t2LoopEnd, t2LoopDec and +// corresponding PHI that make up a low overhead loop. Only handles 'do' loops +// at the moment, returning a t2DoLoopStart in LoopStart. +static bool findLoopComponents(MachineLoop *ML, MachineRegisterInfo *MRI, + MachineInstr *&LoopStart, MachineInstr *&LoopPhi, + MachineInstr *&LoopDec, MachineInstr *&LoopEnd) { + MachineBasicBlock *Header = ML->getHeader(); + MachineBasicBlock *Latch = ML->getLoopLatch(); + if (!Header || !Latch) { + LLVM_DEBUG(dbgs() << " no Loop Latch or Header\n"); + return false; + } + + // Find the loop end from the terminators. + LoopEnd = nullptr; + for (auto &T : Latch->terminators()) { + if (T.getOpcode() == ARM::t2LoopEnd && T.getOperand(1).getMBB() == Header) { + LoopEnd = &T; + break; + } + } + if (!LoopEnd) { + LLVM_DEBUG(dbgs() << " no LoopEnd\n"); + return false; + } + LLVM_DEBUG(dbgs() << " found loop end: " << *LoopEnd); + + // Find the dec from the use of the end. There may be copies between + // instructions. We expect the loop to look like: + // $vs = t2DoLoopStart ... + // loop: + // $vp = phi [ $vs ], [ $vd ] + // ... + // $vd = t2LoopDec $vp + // ... 
+ // t2LoopEnd $vd, loop + LoopDec = + LookThroughCOPY(MRI->getVRegDef(LoopEnd->getOperand(0).getReg()), MRI); + if (!LoopDec || LoopDec->getOpcode() != ARM::t2LoopDec) { + LLVM_DEBUG(dbgs() << " didn't find LoopDec where we expected!\n"); + return false; + } + LLVM_DEBUG(dbgs() << " found loop dec: " << *LoopDec); + + LoopPhi = + LookThroughCOPY(MRI->getVRegDef(LoopDec->getOperand(1).getReg()), MRI); + if (!LoopPhi || LoopPhi->getOpcode() != TargetOpcode::PHI || + LoopPhi->getNumOperands() != 5 || + (LoopPhi->getOperand(2).getMBB() != Latch && + LoopPhi->getOperand(4).getMBB() != Latch)) { + LLVM_DEBUG(dbgs() << " didn't find PHI where we expected!\n"); + return false; + } + LLVM_DEBUG(dbgs() << " found loop phi: " << *LoopPhi); + + Register StartReg = LoopPhi->getOperand(2).getMBB() == Latch + ? LoopPhi->getOperand(3).getReg() + : LoopPhi->getOperand(1).getReg(); + LoopStart = LookThroughCOPY(MRI->getVRegDef(StartReg), MRI); + if (!LoopStart || LoopStart->getOpcode() != ARM::t2DoLoopStart) { + LLVM_DEBUG(dbgs() << " didn't find Start where we expected!\n"); + return false; + } + LLVM_DEBUG(dbgs() << " found loop start: " << *LoopStart); + + return true; +} + +// Convert t2DoLoopStart to t2DoLoopStartTP if the loop contains VCTP +// instructions. This keeps the VCTP count reg operand on the t2DoLoopStartTP +// instruction, making the backend ARMLowOverheadLoops pass's job of finding the +// VCTP operand much simpler. +bool MVEVPTOptimisations::ConvertTailPredLoop(MachineLoop *ML, + MachineDominatorTree *DT) { + LLVM_DEBUG(dbgs() << "ConvertTailPredLoop on loop " + << ML->getHeader()->getName() << "\n"); + + // Find some loop components including the LoopEnd/Dec/Start, and any VCTPs + // in the loop. 
+ MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec; + if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd)) + return false; + + SmallVector VCTPs; + for (MachineBasicBlock *BB : ML->blocks()) + for (MachineInstr &MI : *BB) + if (isVCTP(&MI)) + VCTPs.push_back(&MI); + + if (VCTPs.empty()) { + LLVM_DEBUG(dbgs() << " no VCTPs\n"); + return false; + } + + // Check all VCTPs are the same. + MachineInstr *FirstVCTP = *VCTPs.begin(); + for (MachineInstr *VCTP : VCTPs) { + LLVM_DEBUG(dbgs() << " with VCTP " << *VCTP); + if (VCTP->getOpcode() != FirstVCTP->getOpcode() || + VCTP->getOperand(0).getReg() != FirstVCTP->getOperand(0).getReg()) { + LLVM_DEBUG(dbgs() << " VCTP's are not identical\n"); + return false; + } + } + + // Check the register being used can be set up before the loop. We expect + // this to be: + // $vx = ... + // loop: + // $vp = PHI [ $vx ], [ $vd ] + // .. + // $vpr = VCTP $vp + // .. + // $vd = t2SUBri $vp, #n + // .. + Register CountReg = FirstVCTP->getOperand(1).getReg(); + if (!CountReg.isVirtual()) { + LLVM_DEBUG(dbgs() << " cannot determine VCTP PHI\n"); + return false; + } + MachineInstr *Phi = LookThroughCOPY(MRI->getVRegDef(CountReg), MRI); + if (!Phi || Phi->getOpcode() != TargetOpcode::PHI || + Phi->getNumOperands() != 5 || + (Phi->getOperand(2).getMBB() != ML->getLoopLatch() && + Phi->getOperand(4).getMBB() != ML->getLoopLatch())) { + LLVM_DEBUG(dbgs() << " cannot determine VCTP Count\n"); + return false; + } + CountReg = Phi->getOperand(2).getMBB() == ML->getLoopLatch() + ? Phi->getOperand(3).getReg() + : Phi->getOperand(1).getReg(); + + // Replace the t2DoLoopStart with the t2DoLoopStartTP, move it to the end of + // the preheader and add the new CountReg to it. We attempt to place it late + // in the preheader, but may need to move that earlier based on uses. 
+ MachineBasicBlock *MBB = LoopStart->getParent(); + MachineBasicBlock::iterator InsertPt = MBB->getFirstTerminator(); + for (MachineInstr &Use : + MRI->use_instructions(LoopStart->getOperand(0).getReg())) + if ((InsertPt != MBB->end() && !DT->dominates(&*InsertPt, &Use)) || + !DT->dominates(ML->getHeader(), Use.getParent())) + InsertPt = &Use; + + MachineInstrBuilder MI = BuildMI(*MBB, InsertPt, LoopStart->getDebugLoc(), + TII->get(ARM::t2DoLoopStartTP)) + .add(LoopStart->getOperand(0)) + .add(LoopStart->getOperand(1)) + .addReg(CountReg); + LLVM_DEBUG(dbgs() << "Replacing " << *LoopStart << " with " + << *MI.getInstr()); + MRI->constrainRegClass(CountReg, &ARM::rGPRRegClass); + LoopStart->eraseFromParent(); + + return true; +} // Returns true if Opcode is any VCMP Opcode. static bool IsVCMP(unsigned Opcode) { return VCMPOpcodeToVPT(Opcode) != 0; } @@ -484,11 +661,16 @@ TII = static_cast(STI.getInstrInfo()); MRI = &Fn.getRegInfo(); + MachineLoopInfo *MLI = &getAnalysis(); + MachineDominatorTree *DT = &getAnalysis(); LLVM_DEBUG(dbgs() << "********** ARM MVE VPT Optimisations **********\n" << "********** Function: " << Fn.getName() << '\n'); bool Modified = false; + for (MachineLoop *ML : MLI->getBase().getLoopsInPreorder()) + Modified |= ConvertTailPredLoop(ML, DT); + for (MachineBasicBlock &MBB : Fn) { Modified |= ReplaceVCMPsByVPNOTs(MBB); Modified |= ReduceOldVCCRValueUses(MBB); diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll --- a/llvm/test/CodeGen/ARM/O3-pipeline.ll +++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll @@ -96,7 +96,8 @@ ; CHECK-NEXT: Machine code sinking ; CHECK-NEXT: Peephole Optimizations ; CHECK-NEXT: Remove dead machine instructions -; CHECK-NEXT: MVE VPT Optimisation Pass +; CHECK-NEXT: MachineDominator Tree Construction +; CHECK-NEXT: MVE TailPred and VPT Optimisation Pass ; CHECK-NEXT: ARM MLA / MLS expansion pass ; CHECK-NEXT: MachineDominator Tree Construction ; CHECK-NEXT: ARM pre- register allocation 
load / store optimization pass diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/exitcount.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/exitcount.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/exitcount.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/exitcount.ll @@ -11,7 +11,7 @@ ; CHECK-NEXT: ldrd r2, r3, [r0, #8] ; CHECK-NEXT: rsb r12, r12, r4, lsl #1 ; CHECK-NEXT: mov r4, r12 -; CHECK-NEXT: dlstp.16 lr, r4 +; CHECK-NEXT: dlstp.16 lr, r12 ; CHECK-NEXT: .LBB0_1: @ %do.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q0, [r3], #16 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll @@ -16,7 +16,7 @@ ; CHECK-NEXT: adds r4, #3 ; CHECK-NEXT: add.w r12, r3, r4, lsr #2 ; CHECK-NEXT: mov r3, r1 -; CHECK-NEXT: dlstp.32 lr, r3 +; CHECK-NEXT: dlstp.32 lr, r1 ; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: .LBB0_1: @ %do.body.i ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll @@ -414,11 +414,11 @@ ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: b .LBB5_8 ; CHECK-NEXT: .LBB5_4: @ %vector.ph -; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: movs r6, #0 ; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB5_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: adds r4, #4 +; CHECK-NEXT: adds r6, #4 ; CHECK-NEXT: vldrb.u32 q0, [r0], #4 ; CHECK-NEXT: vldrb.u32 q1, [r1], #4 ; CHECK-NEXT: vmlas.u32 q1, q0, r2 @@ -716,11 +716,11 @@ ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: b .LBB7_8 ; CHECK-NEXT: .LBB7_4: @ %vector.ph -; CHECK-NEXT: movs r4, #0 +; 
CHECK-NEXT: movs r6, #0 ; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB7_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: adds r4, #4 +; CHECK-NEXT: adds r6, #4 ; CHECK-NEXT: vldrb.u32 q0, [r0], #4 ; CHECK-NEXT: vldrb.u32 q1, [r1], #4 ; CHECK-NEXT: vmlas.u32 q1, q0, r2 @@ -1018,11 +1018,11 @@ ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: b .LBB9_8 ; CHECK-NEXT: .LBB9_4: @ %vector.ph -; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: movs r6, #0 ; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB9_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: adds r4, #4 +; CHECK-NEXT: adds r6, #4 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: vmlas.u32 q1, q0, r2 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll @@ -550,9 +550,9 @@ ; CHECK-NEXT: cbz r2, .LBB7_4 ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: adds r3, r2, #7 -; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: bic r3, r3, #7 ; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: bic r3, r3, #7 +; CHECK-NEXT: movs r4, #1 ; CHECK-NEXT: subs r3, #8 ; CHECK-NEXT: vmov q3, q0 ; CHECK-NEXT: add.w lr, r4, r3, lsr #3 diff --git a/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll b/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll --- a/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll @@ -11,8 +11,8 @@ ; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: .LBB0_1: @ %vector.ph ; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r4, #4 @@ -74,8 +74,8 @@ ; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: .LBB1_1: @ %vector.ph ; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: 
dlstp.32 lr, r3 ; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r4, #4 @@ -138,8 +138,8 @@ ; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: .LBB2_1: @ %vector.ph ; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB2_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r4, #4 @@ -201,8 +201,8 @@ ; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: .LBB3_1: @ %vector.ph ; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r4, #4 @@ -396,28 +396,19 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB6_1: @ %vector.ph -; CHECK-NEXT: add.w r12, r3, #3 -; CHECK-NEXT: mov.w lr, #1 -; CHECK-NEXT: bic r12, r12, #3 -; CHECK-NEXT: sub.w r12, r12, #4 -; CHECK-NEXT: add.w lr, lr, r12, lsr #2 ; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vdup.32 q0, r12 ; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB6_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vctp.32 r3 ; CHECK-NEXT: add.w r12, r12, #4 -; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vmov q3, q0 -; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrwt.u32 q1, [r1], #16 -; CHECK-NEXT: vldrwt.u32 q2, [r0], #16 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 +; CHECK-NEXT: vldrw.u32 q2, [r0], #16 ; CHECK-NEXT: vfms.f32 q3, q2, q1 -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q3, [r2], #16 -; CHECK-NEXT: le lr, .LBB6_2 +; CHECK-NEXT: vstrw.32 q3, [r2], #16 +; CHECK-NEXT: letp lr, .LBB6_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: @@ -471,28 +462,19 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB7_1: @ %vector.ph -; 
CHECK-NEXT: add.w r12, r3, #3 -; CHECK-NEXT: mov.w lr, #1 -; CHECK-NEXT: bic r12, r12, #3 -; CHECK-NEXT: sub.w r12, r12, #4 -; CHECK-NEXT: add.w lr, lr, r12, lsr #2 ; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vdup.32 q0, r12 ; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB7_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vctp.32 r3 ; CHECK-NEXT: add.w r12, r12, #4 -; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vmov q3, q0 -; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrwt.u32 q1, [r0], #16 -; CHECK-NEXT: vldrwt.u32 q2, [r1], #16 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 ; CHECK-NEXT: vfms.f32 q3, q2, q1 -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q3, [r2], #16 -; CHECK-NEXT: le lr, .LBB7_2 +; CHECK-NEXT: vstrw.32 q3, [r2], #16 +; CHECK-NEXT: letp lr, .LBB7_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: @@ -611,27 +593,18 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB9_1: @ %vector.ph -; CHECK-NEXT: add.w r12, r3, #3 -; CHECK-NEXT: mov.w lr, #1 -; CHECK-NEXT: bic r12, r12, #3 -; CHECK-NEXT: sub.w r12, r12, #4 -; CHECK-NEXT: add.w lr, lr, r12, lsr #2 ; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vdup.32 q0, r12 ; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB9_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vctp.32 r3 ; CHECK-NEXT: add.w r12, r12, #4 -; CHECK-NEXT: subs r3, #4 -; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrwt.u32 q1, [r0], #16 -; CHECK-NEXT: vldrwt.u32 q2, [r1], #16 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 ; CHECK-NEXT: vfms.f32 q2, q1, q0 -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q2, [r2], #16 -; CHECK-NEXT: le lr, .LBB9_2 +; CHECK-NEXT: vstrw.32 q2, [r2], #16 +; CHECK-NEXT: letp lr, .LBB9_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: @@ 
-686,8 +659,8 @@ ; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: .LBB10_1: @ %vector.ph ; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB10_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 @@ -751,8 +724,8 @@ ; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: .LBB11_1: @ %vector.ph ; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB11_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll @@ -10,8 +10,8 @@ ; CHECK-NEXT: adr r0, .LCPI0_0 ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: movw lr, #1250 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vadd.i32 q0, q0, r1 ; CHECK-NEXT: adds r1, r3, #4 ; CHECK-NEXT: .LBB0_1: @ %vector.body @@ -78,14 +78,14 @@ ; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: movw lr, #1250 ; CHECK-NEXT: add.w r4, r0, r3, lsl #2 ; CHECK-NEXT: adr r0, .LCPI1_0 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: movw lr, #1250 ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: add.w r12, r3, #4 ; CHECK-NEXT: vmov.i32 q2, #0x0 ; CHECK-NEXT: vmov.i32 q0, #0x14 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r3 @@ -156,8 +156,8 @@ ; CHECK-NEXT: adr r0, .LCPI2_0 ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: movw lr, #1250 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vadd.i32 q0, q0, r1 ; CHECK-NEXT: adds r1, r3, #4 ; 
CHECK-NEXT: vmov.i32 q2, #0x3 diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll --- a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll +++ b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll @@ -15,31 +15,24 @@ ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-NEXT: ldr r5, [r0, #8] ; CHECK-NEXT: ldr r3, [r0] -; CHECK-NEXT: adds r0, r5, #3 -; CHECK-NEXT: bic r0, r0, #3 ; CHECK-NEXT: add.w r4, r3, r5, lsl #2 -; CHECK-NEXT: subs r3, r0, #4 ; CHECK-NEXT: movs r0, #1 ; CHECK-NEXT: lsl.w r9, r5, #2 -; CHECK-NEXT: add.w r8, r0, r3, lsr #2 ; CHECK-NEXT: .LBB0_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB0_3 Depth 2 -; CHECK-NEXT: dls lr, r8 ; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: dlstp.32 lr, r5 ; CHECK-NEXT: mov r7, r1 ; CHECK-NEXT: mov r3, r4 ; CHECK-NEXT: mov r6, r5 ; CHECK-NEXT: .LBB0_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB0_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vctp.32 r6 -; CHECK-NEXT: subs r6, #4 -; CHECK-NEXT: vpsttt -; CHECK-NEXT: vldrwt.u32 q1, [r7], #16 -; CHECK-NEXT: vldrwt.u32 q2, [r3], #16 -; CHECK-NEXT: vfmat.f32 q0, q2, q1 -; CHECK-NEXT: le lr, .LBB0_3 +; CHECK-NEXT: vldrw.u32 q1, [r7], #16 +; CHECK-NEXT: vldrw.u32 q2, [r3], #16 +; CHECK-NEXT: vfma.f32 q0, q2, q1 +; CHECK-NEXT: letp lr, .LBB0_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: vadd.f32 s4, s2, s3 @@ -124,29 +117,29 @@ ; CHECK-NEXT: ldr r3, [r0] ; CHECK-NEXT: add.w r0, r12, #3 ; CHECK-NEXT: bic r0, r0, #3 -; CHECK-NEXT: add.w r6, r3, r12, lsl #2 +; CHECK-NEXT: add.w r5, r3, r12, lsl #2 ; CHECK-NEXT: subs r0, #4 ; CHECK-NEXT: add.w r7, r3, r12, lsl #3 -; CHECK-NEXT: lsl.w r10, r12, #3 +; CHECK-NEXT: lsl.w r9, r12, #3 ; CHECK-NEXT: add.w r8, r4, r0, lsr #2 ; CHECK-NEXT: .LBB1_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB1_3 Depth 2 ; CHECK-NEXT: dls lr, r8 +; 
CHECK-NEXT: ldr r6, [sp] @ 4-byte Reload ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: ldr r5, [sp] @ 4-byte Reload ; CHECK-NEXT: add.w r11, r4, #1 -; CHECK-NEXT: mov r3, r6 +; CHECK-NEXT: mov r3, r5 ; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: mov r9, r12 +; CHECK-NEXT: mov r10, r12 ; CHECK-NEXT: .LBB1_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB1_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vctp.32 r9 -; CHECK-NEXT: sub.w r9, r9, #4 +; CHECK-NEXT: vctp.32 r10 +; CHECK-NEXT: sub.w r10, r10, #4 ; CHECK-NEXT: vpstttt -; CHECK-NEXT: vldrwt.u32 q2, [r5], #16 +; CHECK-NEXT: vldrwt.u32 q2, [r6], #16 ; CHECK-NEXT: vldrwt.u32 q3, [r3], #16 ; CHECK-NEXT: vfmat.f32 q1, q3, q2 ; CHECK-NEXT: vldrwt.u32 q3, [r0], #16 @@ -158,9 +151,9 @@ ; CHECK-NEXT: vadd.f32 s8, s2, s3 ; CHECK-NEXT: add.w r0, r2, r11, lsl #2 ; CHECK-NEXT: vadd.f32 s0, s0, s1 -; CHECK-NEXT: add r6, r10 +; CHECK-NEXT: add r5, r9 ; CHECK-NEXT: vadd.f32 s2, s6, s7 -; CHECK-NEXT: add r7, r10 +; CHECK-NEXT: add r7, r9 ; CHECK-NEXT: vadd.f32 s4, s4, s5 ; CHECK-NEXT: vadd.f32 s0, s0, s8 ; CHECK-NEXT: vadd.f32 s2, s4, s2 @@ -274,15 +267,14 @@ ; CHECK-NEXT: .LBB2_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB2_3 Depth 2 -; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload +; CHECK-NEXT: ldrd r0, r10, [sp] @ 8-byte Folded Reload ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: add.w r9, r5, #2 ; CHECK-NEXT: add.w r11, r5, #1 ; CHECK-NEXT: dls lr, r0 -; CHECK-NEXT: mov r3, r12 ; CHECK-NEXT: ldr r6, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: mov r3, r12 ; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: ldr.w r10, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: mov r4, r8 ; CHECK-NEXT: vmov q2, q0 ; CHECK-NEXT: vmov q1, q0 @@ -444,22 +436,21 @@ ; CHECK-NEXT: .LBB3_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB3_3 Depth 2 -; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: 
mov r3, r8 -; CHECK-NEXT: mov r5, r9 -; CHECK-NEXT: dls lr, r0 ; CHECK-NEXT: adds r0, r6, #3 ; CHECK-NEXT: str r0, [sp, #28] @ 4-byte Spill ; CHECK-NEXT: adds r0, r6, #2 -; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: mov r4, r10 -; CHECK-NEXT: ldr.w r11, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: str r0, [sp, #24] @ 4-byte Spill ; CHECK-NEXT: adds r0, r6, #1 ; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: ldrd r0, r11, [sp, #4] @ 8-byte Folded Reload +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: mov r5, r9 +; CHECK-NEXT: dls lr, r0 +; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: mov r0, r12 +; CHECK-NEXT: mov r4, r10 +; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmov q2, q0 ; CHECK-NEXT: vmov q3, q0 ; CHECK-NEXT: .LBB3_3: @ %vector.body @@ -645,19 +636,18 @@ ; CHECK-NEXT: .LBB4_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB4_3 Depth 2 -; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: add.w r10, r0, #2 -; CHECK-NEXT: adds r7, r0, #1 -; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: adds r1, r0, #4 ; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill ; CHECK-NEXT: adds r1, r0, #3 ; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill -; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: ldrd r1, r11, [sp, #8] @ 8-byte Folded Reload +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: add.w r10, r0, #2 +; CHECK-NEXT: adds r7, r0, #1 +; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: mov r3, r8 ; CHECK-NEXT: vmov q0, q1 -; CHECK-NEXT: ldr.w r11, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: vmov q3, q1 ; CHECK-NEXT: vmov q2, q1 ; CHECK-NEXT: vmov q4, q1 @@ -866,21 +856,20 @@ ; CHECK-NEXT: .LBB5_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB5_3 Depth 2 -; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: add.w r11, r0, 
#2 -; CHECK-NEXT: adds r4, r0, #1 -; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: adds r1, r0, #5 ; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill ; CHECK-NEXT: adds r1, r0, #4 ; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill ; CHECK-NEXT: adds r1, r0, #3 ; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill -; CHECK-NEXT: mov r3, r9 +; CHECK-NEXT: ldrd r1, r8, [sp, #4] @ 8-byte Folded Reload +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: add.w r11, r0, #2 +; CHECK-NEXT: adds r4, r0, #1 +; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: mov r3, r9 ; CHECK-NEXT: vmov q3, q1 -; CHECK-NEXT: ldr.w r8, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: vmov q4, q1 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vmov q5, q1 @@ -1113,11 +1102,6 @@ ; CHECK-NEXT: .LBB6_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB6_3 Depth 2 -; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: vmov.i32 q2, #0x0 -; CHECK-NEXT: adds r4, r0, #2 -; CHECK-NEXT: add.w r8, r0, #1 -; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: adds r1, r0, #6 ; CHECK-NEXT: str r1, [sp, #44] @ 4-byte Spill ; CHECK-NEXT: adds r1, r0, #5 @@ -1126,10 +1110,14 @@ ; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill ; CHECK-NEXT: adds r1, r0, #3 ; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill -; CHECK-NEXT: mov r3, r12 +; CHECK-NEXT: ldrd r3, r1, [sp, #16] @ 8-byte Folded Reload +; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: adds r4, r0, #2 +; CHECK-NEXT: add.w r8, r0, #1 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: ldr.w r9, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: mov r3, r12 ; CHECK-NEXT: vmov q4, q2 -; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: vmov q5, q2 ; CHECK-NEXT: vmov q3, q2 ; CHECK-NEXT: vmov q6, q2 @@ -1401,24 +1389,23 @@ ; CHECK-NEXT: .LBB7_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB7_3 Depth 2 -; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: vmov.i32 q3, #0x0 -; CHECK-NEXT: adds r4, r0, #3 -; 
CHECK-NEXT: add.w r8, r0, #2 -; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: adds r1, r0, #7 ; CHECK-NEXT: str r1, [sp, #44] @ 4-byte Spill ; CHECK-NEXT: adds r1, r0, #6 +; CHECK-NEXT: ldrd r3, r10, [sp, #16] @ 8-byte Folded Reload ; CHECK-NEXT: str r1, [sp, #40] @ 4-byte Spill ; CHECK-NEXT: adds r1, r0, #5 ; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill ; CHECK-NEXT: adds r1, r0, #4 +; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: ldr.w r12, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: vmov.i32 q3, #0x0 +; CHECK-NEXT: adds r4, r0, #3 +; CHECK-NEXT: add.w r8, r0, #2 +; CHECK-NEXT: adds r1, r0, #1 ; CHECK-NEXT: mov r3, r9 -; CHECK-NEXT: ldr.w r10, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: vmov q5, q3 -; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill -; CHECK-NEXT: adds r1, r0, #1 ; CHECK-NEXT: vmov q6, q3 ; CHECK-NEXT: vmov q4, q3 ; CHECK-NEXT: vmov q7, q3 diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll --- a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll +++ b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll @@ -743,12 +743,12 @@ ; CHECK-NEXT: ldr.w r11, [sp, #88] ; CHECK-NEXT: mov r6, r12 ; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: dlstp.16 lr, r11 -; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: mov r8, r12 +; CHECK-NEXT: dlstp.16 lr, r11 +; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: mla r3, r9, r11, r1 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: ldrd r7, r0, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: ldrd r7, r5, [sp] @ 8-byte Folded Reload ; CHECK-NEXT: mov r10, r12 ; CHECK-NEXT: .LBB5_7: @ %for.body24 ; CHECK-NEXT: @ Parent Loop BB5_5 Depth=1 @@ -757,10 +757,10 @@ ; CHECK-NEXT: vadd.i16 q1, q0, r4 ; CHECK-NEXT: vldrb.s16 q0, [r3], #8 ; CHECK-NEXT: vmlava.s16 r12, q0, q1 -; CHECK-NEXT: vldrb.s16 q1, [r5], #8 +; CHECK-NEXT: vldrb.s16 q1, [r0], #8 ; CHECK-NEXT: vadd.i16 q1, q1, r4 ; CHECK-NEXT: vmlava.s16 r6, q0, q1 -; 
CHECK-NEXT: vldrb.s16 q1, [r0], #8 +; CHECK-NEXT: vldrb.s16 q1, [r5], #8 ; CHECK-NEXT: vadd.i16 q1, q1, r4 ; CHECK-NEXT: vmlava.s16 r8, q0, q1 ; CHECK-NEXT: vldrb.s16 q1, [r1], #8 @@ -915,12 +915,12 @@ ; CHECK-NEXT: ldr.w r11, [sp, #88] ; CHECK-NEXT: mov r6, r12 ; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: dlstp.16 lr, r11 -; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: mov r8, r12 +; CHECK-NEXT: dlstp.16 lr, r11 +; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: mla r3, r9, r11, r1 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: ldrd r7, r0, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: ldrd r7, r5, [sp] @ 8-byte Folded Reload ; CHECK-NEXT: mov r10, r12 ; CHECK-NEXT: .LBB6_5: @ %for.body24 ; CHECK-NEXT: @ Parent Loop BB6_3 Depth=1 @@ -929,10 +929,10 @@ ; CHECK-NEXT: vadd.i16 q1, q0, r4 ; CHECK-NEXT: vldrb.s16 q0, [r3], #8 ; CHECK-NEXT: vmlava.s16 r12, q0, q1 -; CHECK-NEXT: vldrb.s16 q1, [r5], #8 +; CHECK-NEXT: vldrb.s16 q1, [r0], #8 ; CHECK-NEXT: vadd.i16 q1, q1, r4 ; CHECK-NEXT: vmlava.s16 r6, q0, q1 -; CHECK-NEXT: vldrb.s16 q1, [r0], #8 +; CHECK-NEXT: vldrb.s16 q1, [r5], #8 ; CHECK-NEXT: vadd.i16 q1, q1, r4 ; CHECK-NEXT: vmlava.s16 r8, q0, q1 ; CHECK-NEXT: vldrb.s16 q1, [r1], #8