diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -367,7 +367,8 @@ bool isUnspillableTerminatorImpl(const MachineInstr *MI) const override { return MI->getOpcode() == ARM::t2LoopEndDec || MI->getOpcode() == ARM::t2DoLoopStartTP || - MI->getOpcode() == ARM::t2WhileLoopStartLR; + MI->getOpcode() == ARM::t2WhileLoopStartLR || + MI->getOpcode() == ARM::t2WhileLoopStartTP; } private: @@ -645,12 +646,6 @@ Opc == ARM::t2BR_JT; } -static inline bool isLowOverheadTerminatorOpcode(int Opc) { - return Opc == ARM::t2DoLoopStartTP || Opc == ARM::t2WhileLoopStart || - Opc == ARM::t2WhileLoopStartLR || Opc == ARM::t2LoopEnd || - Opc == ARM::t2LoopEndDec; -} - static inline bool isIndirectBranchOpcode(int Opc) { return Opc == ARM::BX || Opc == ARM::MOVPCRX || Opc == ARM::tBRIND; diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -6122,8 +6122,9 @@ // Be conservative with ARMv8.1 MVE instructions. 
if (Opc == ARM::t2BF_LabelPseudo || Opc == ARM::t2DoLoopStart || Opc == ARM::t2DoLoopStartTP || Opc == ARM::t2WhileLoopStart || - Opc == ARM::t2WhileLoopStartLR || Opc == ARM::t2LoopDec || - Opc == ARM::t2LoopEnd || Opc == ARM::t2LoopEndDec) + Opc == ARM::t2WhileLoopStartLR || Opc == ARM::t2WhileLoopStartTP || + Opc == ARM::t2LoopDec || Opc == ARM::t2LoopEnd || + Opc == ARM::t2LoopEndDec) return outliner::InstrType::Illegal; const MCInstrDesc &MCID = MI.getDesc(); diff --git a/llvm/lib/Target/ARM/ARMBlockPlacement.cpp b/llvm/lib/Target/ARM/ARMBlockPlacement.cpp --- a/llvm/lib/Target/ARM/ARMBlockPlacement.cpp +++ b/llvm/lib/Target/ARM/ARMBlockPlacement.cpp @@ -15,6 +15,7 @@ #include "ARMBaseInstrInfo.h" #include "ARMBasicBlockInfo.h" #include "ARMSubtarget.h" +#include "MVETailPredUtils.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineLoopInfo.h" @@ -61,13 +62,13 @@ static MachineInstr *findWLSInBlock(MachineBasicBlock *MBB) { for (auto &Terminator : MBB->terminators()) { - if (Terminator.getOpcode() == ARM::t2WhileLoopStartLR) + if (isWhileLoopStart(Terminator)) return &Terminator; } return nullptr; } -/// Find t2WhileLoopStartLR in the loop predecessor BB or otherwise in its only -/// predecessor. If found, returns (BB, WLS Instr) pair, otherwise a null pair. +/// Find WhileLoopStart in the loop predecessor BB or otherwise in its only +/// predecessor. If found, returns the WLS instruction, otherwise nullptr. static MachineInstr *findWLS(MachineLoop *ML) { MachineBasicBlock *Predecessor = ML->getLoopPredecessor(); @@ -93,7 +94,7 @@ return false; MachineBasicBlock *Predecessor = WlsInstr->getParent(); - MachineBasicBlock *LoopExit = WlsInstr->getOperand(2).getMBB(); + MachineBasicBlock *LoopExit = getWhileLoopStartTargetBB(*WlsInstr); // We don't want to move Preheader to before the function's entry block. 
if (!LoopExit->getPrevNode()) @@ -118,9 +119,9 @@ ++It) { MachineBasicBlock *MBB = &*It; for (auto &Terminator : MBB->terminators()) { - if (Terminator.getOpcode() != ARM::t2WhileLoopStartLR) + if (!isWhileLoopStart(Terminator)) continue; - MachineBasicBlock *WLSTarget = Terminator.getOperand(2).getMBB(); + MachineBasicBlock *WLSTarget = getWhileLoopStartTargetBB(Terminator); // TODO: Analyse the blocks to make a decision if it would be worth // moving Preheader even if we'd introduce a backwards WLS if (WLSTarget == Predecessor) { diff --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td --- a/llvm/lib/Target/ARM/ARMInstrThumb2.td +++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td @@ -5479,8 +5479,8 @@ // t2DoLoopStart a pseudo for DLS hardware loops. Lowered into a DLS in // ARMLowOverheadLoops if possible, or reverted to a Mov if not. def t2DoLoopStart : - t2PseudoInst<(outs GPRlr:$X), (ins rGPR:$elts), 4, IIC_Br, - [(set GPRlr:$X, (int_start_loop_iterations rGPR:$elts))]>; + t2PseudoInst<(outs GPRlr:$X), (ins rGPR:$tc), 4, IIC_Br, + [(set GPRlr:$X, (int_start_loop_iterations rGPR:$tc))]>; // A pseudo for a DLSTP, created in the MVETPAndVPTOptimizationPass from a // t2DoLoopStart if the loops is tail predicated. Holds both the element @@ -5488,7 +5488,7 @@ // ARMLowOverheadLoops when it is converted to a DLSTP or DLS as required. let isTerminator = 1, hasSideEffects = 1 in def t2DoLoopStartTP : - t2PseudoInst<(outs GPRlr:$X), (ins rGPR:$elts, rGPR:$count), 4, IIC_Br, []>; + t2PseudoInst<(outs GPRlr:$X), (ins rGPR:$tc, rGPR:$elts), 4, IIC_Br, []>; // Setup for a t2WhileLoopStart. A pair of t2WhileLoopSetup and t2WhileLoopStart // will be created post-ISel from a llvm.test.start.loop.iterations. This @@ -5496,7 +5496,7 @@ // valid after reg alloc, as it should be lowered during MVETPAndVPTOptimisations // into a t2WhileLoopStartLR (or expanded). 
def t2WhileLoopSetup : - t2PseudoInst<(outs GPRlr:$lr), (ins rGPR:$elts), 4, IIC_Br, []>; + t2PseudoInst<(outs GPRlr:$lr), (ins rGPR:$tc), 4, IIC_Br, []>; // A pseudo to represent the decrement in a low overhead loop. A t2LoopDec and // t2LoopEnd together represent a LE instruction. Ideally these are converted @@ -5511,7 +5511,7 @@ // into a t2WhileLoopStartLR that does both the LR setup and branch. def t2WhileLoopStart : t2PseudoInst<(outs), - (ins GPRlr:$elts, brtarget:$target), + (ins GPRlr:$tc, brtarget:$target), 4, IIC_Br, []>, Sched<[WriteBr]>; @@ -5521,13 +5521,21 @@ // converted into t2CMP and t2Bcc. def t2WhileLoopStartLR : t2PseudoInst<(outs GPRlr:$lr), - (ins rGPR:$elts, brtarget:$target), + (ins rGPR:$tc, brtarget:$target), + 8, IIC_Br, []>, + Sched<[WriteBr]>; + +// Similar to a t2DoLoopStartTP, a t2WhileLoopStartTP is a pseudo for a WLSTP +// holding both the element count and the tripcount of the loop. +def t2WhileLoopStartTP : + t2PseudoInst<(outs GPRlr:$lr), + (ins rGPR:$tc, rGPR:$elts, brtarget:$target), 8, IIC_Br, []>, Sched<[WriteBr]>; // t2LoopEnd - the branch half of a t2LoopDec/t2LoopEnd pair. def t2LoopEnd : - t2PseudoInst<(outs), (ins GPRlr:$elts, brtarget:$target), + t2PseudoInst<(outs), (ins GPRlr:$tc, brtarget:$target), 8, IIC_Br, []>, Sched<[WriteBr]>; // The combination of a t2LoopDec and t2LoopEnd, performing both the LR @@ -5535,7 +5543,7 @@ // LETP in ARMLowOverheadLoops as appropriate, or converted to t2CMP/t2Bcc // if the branches are out of range. 
def t2LoopEndDec : - t2PseudoInst<(outs GPRlr:$Rm), (ins GPRlr:$elts, brtarget:$target), + t2PseudoInst<(outs GPRlr:$Rm), (ins GPRlr:$tc, brtarget:$target), 8, IIC_Br, []>, Sched<[WriteBr]>; } // end isBranch, isTerminator, hasSideEffects diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -101,10 +101,6 @@ return isDomainMVE(&MI) || isVectorPredicate(&MI) || hasVPRUse(MI); } -static bool isDo(MachineInstr *MI) { - return MI->getOpcode() != ARM::t2WhileLoopStartLR; -} - namespace { using InstSet = SmallPtrSetImpl; @@ -446,7 +442,7 @@ } unsigned getStartOpcode() const { - bool IsDo = isDo(Start); + bool IsDo = isDoLoopStart(*Start); if (!IsTailPredicationLegal()) return IsDo ? ARM::t2DLS : ARM::t2WLS; @@ -635,7 +631,8 @@ // elements is provided to the vctp instruction, so we need to check that // we can use this register at InsertPt. MachineInstr *VCTP = VCTPs.back(); - if (Start->getOpcode() == ARM::t2DoLoopStartTP) { + if (Start->getOpcode() == ARM::t2DoLoopStartTP || + Start->getOpcode() == ARM::t2WhileLoopStartTP) { TPNumElements = Start->getOperand(2); StartInsertPt = Start; StartInsertBB = Start->getParent(); @@ -778,10 +775,12 @@ } } - // If we converted the LoopStart to a t2DoLoopStartTP, we can also remove any - // extra instructions in the preheader, which often includes a now unused MOV. - if (Start->getOpcode() == ARM::t2DoLoopStartTP && Preheader && - !Preheader->empty() && + // If we converted the LoopStart to a t2DoLoopStartTP/t2WhileLoopStartTP, we + // can also remove any extra instructions in the preheader, which often + // includes a now unused MOV. 
+ if ((Start->getOpcode() == ARM::t2DoLoopStartTP || + Start->getOpcode() == ARM::t2WhileLoopStartTP) && + Preheader && !Preheader->empty() && !RDA.hasLocalDefBefore(VCTP, VCTP->getOperand(1).getReg())) { if (auto *Def = RDA.getUniqueReachingMIDef( &Preheader->back(), VCTP->getOperand(1).getReg().asMCReg())) { @@ -1045,12 +1044,13 @@ return false; } - if (Start->getOpcode() == ARM::t2WhileLoopStartLR && - (BBUtils->getOffsetOf(Start) > - BBUtils->getOffsetOf(Start->getOperand(2).getMBB()) || - !BBUtils->isBBInRange(Start, Start->getOperand(2).getMBB(), 4094))) { - LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n"); - return false; + if (isWhileLoopStart(*Start)) { + MachineBasicBlock *TargetBB = getWhileLoopStartTargetBB(*Start); + if (BBUtils->getOffsetOf(Start) > BBUtils->getOffsetOf(TargetBB) || + !BBUtils->isBBInRange(Start, TargetBB, 4094)) { + LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n"); + return false; + } } return true; }; @@ -1289,7 +1289,7 @@ // another low register. void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const { LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI); - MachineBasicBlock *DestBB = MI->getOperand(2).getMBB(); + MachineBasicBlock *DestBB = getWhileLoopStartTargetBB(*MI); unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ? 
ARM::tBcc : ARM::t2Bcc; @@ -1426,8 +1426,8 @@ MIB.addDef(ARM::LR); MIB.add(Count); - if (!isDo(Start)) - MIB.add(Start->getOperand(2)); + if (isWhileLoopStart(*Start)) + MIB.addMBB(getWhileLoopStartTargetBB(*Start)); LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB); NewStart = &*MIB; @@ -1612,7 +1612,7 @@ }; if (LoLoop.Revert) { - if (LoLoop.Start->getOpcode() == ARM::t2WhileLoopStartLR) + if (isWhileLoopStart(*LoLoop.Start)) RevertWhile(LoLoop.Start); else RevertDo(LoLoop.Start); @@ -1683,7 +1683,7 @@ Changed = true; for (auto *Start : Starts) { - if (Start->getOpcode() == ARM::t2WhileLoopStartLR) + if (isWhileLoopStart(*Start)) RevertWhile(Start); else RevertDo(Start); diff --git a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp --- a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp +++ b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp @@ -429,7 +429,8 @@ MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec; if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd)) return false; - if (LoopDec != LoopEnd || LoopStart->getOpcode() != ARM::t2DoLoopStart) + if (LoopDec != LoopEnd || (LoopStart->getOpcode() != ARM::t2DoLoopStart && + LoopStart->getOpcode() != ARM::t2WhileLoopStartLR)) return false; SmallVector VCTPs; @@ -494,12 +495,16 @@ return false; } - MachineInstrBuilder MI = BuildMI(*MBB, InsertPt, LoopStart->getDebugLoc(), - TII->get(ARM::t2DoLoopStartTP)) - .add(LoopStart->getOperand(0)) - .add(LoopStart->getOperand(1)) - .addReg(CountReg); - (void)MI; + unsigned NewOpc = LoopStart->getOpcode() == ARM::t2DoLoopStart + ? 
ARM::t2DoLoopStartTP
+                        : ARM::t2WhileLoopStartTP;
+  MachineInstrBuilder MI =
+      BuildMI(*MBB, InsertPt, LoopStart->getDebugLoc(), TII->get(NewOpc))
+          .add(LoopStart->getOperand(0))
+          .add(LoopStart->getOperand(1))
+          .addReg(CountReg);
+  if (NewOpc == ARM::t2WhileLoopStartTP)
+    MI.add(LoopStart->getOperand(2));
   LLVM_DEBUG(dbgs() << "Replacing " << *LoopStart << " with "
                     << *MI.getInstr());
   MRI->constrainRegClass(CountReg, &ARM::rGPRRegClass);
diff --git a/llvm/lib/Target/ARM/MVETailPredUtils.h b/llvm/lib/Target/ARM/MVETailPredUtils.h
--- a/llvm/lib/Target/ARM/MVETailPredUtils.h
+++ b/llvm/lib/Target/ARM/MVETailPredUtils.h
@@ -68,11 +68,34 @@
   return false;
 }
 
-static inline bool isLoopStart(MachineInstr &MI) {
+static inline bool isDoLoopStart(const MachineInstr &MI) {
   return MI.getOpcode() == ARM::t2DoLoopStart ||
-         MI.getOpcode() == ARM::t2DoLoopStartTP ||
-         MI.getOpcode() == ARM::t2WhileLoopStart ||
-         MI.getOpcode() == ARM::t2WhileLoopStartLR;
+         MI.getOpcode() == ARM::t2DoLoopStartTP;
+}
+
+static inline bool isWhileLoopStart(const MachineInstr &MI) {
+  return MI.getOpcode() == ARM::t2WhileLoopStart ||
+         MI.getOpcode() == ARM::t2WhileLoopStartLR ||
+         MI.getOpcode() == ARM::t2WhileLoopStartTP;
+}
+
+static inline bool isLoopStart(const MachineInstr &MI) {
+  return isDoLoopStart(MI) || isWhileLoopStart(MI);
+}
+
+// Return the target (loop exit) block stored in a WhileLoopStart pseudo. The
+// branch target is always the last explicit operand, but its index differs:
+//   t2WhileLoopStart:   (ins $tc, $target)                    -> operand 1
+//   t2WhileLoopStartLR: (outs $lr), (ins $tc, $target)        -> operand 2
+//   t2WhileLoopStartTP: (outs $lr), (ins $tc, $elts, $target) -> operand 3
+inline MachineBasicBlock *getWhileLoopStartTargetBB(const MachineInstr &MI) {
+  assert(isWhileLoopStart(MI) && "Expected WhileLoopStart!");
+  unsigned Op = 2;
+  if (MI.getOpcode() == ARM::t2WhileLoopStartTP)
+    Op = 3;
+  else if (MI.getOpcode() == ARM::t2WhileLoopStart)
+    Op = 1;
+  return MI.getOperand(Op).getMBB();
 }
 
 // WhileLoopStart holds the exit block, so produce a subs Op0, Op1, 0 and then a
@@ -84,8 +107,9 @@
                                    unsigned BrOpc = ARM::t2Bcc,
                                    bool UseCmp = false) {
   MachineBasicBlock *MBB = MI->getParent();
-  assert(MI->getOpcode() == ARM::t2WhileLoopStartLR &&
-         "Only expected a t2WhileLoopStartLR in RevertWhileLoopStartLR!");
+  assert((MI->getOpcode() == ARM::t2WhileLoopStartLR ||
+          MI->getOpcode() == ARM::t2WhileLoopStartTP) &&
+         "Only expected a t2WhileLoopStartLR/TP in RevertWhileLoopStartLR!");
 
   // Subs/Cmp
   if (UseCmp) {
@@ -109,8 +133,8 @@
   // Branch
   MachineInstrBuilder MIB =
       BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc));
-  MIB.add(MI->getOperand(2)); // branch target
-  MIB.addImm(ARMCC::EQ);      // condition code
+  MIB.addMBB(getWhileLoopStartTargetBB(*MI)); // branch target
+  MIB.addImm(ARMCC::EQ);                      // condition code
   MIB.addReg(ARM::CPSR);
 
   MI->eraseFromParent();
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll
@@ -17,8 +17,7 @@
 ; CHECK-NEXT:    @ Child Loop BB0_4 Depth 2
 ; CHECK-NEXT:    adds r4, r1, r7
 ; CHECK-NEXT:    adds r5, r0, r7
-; CHECK-NEXT:    mov r6, r3
-; CHECK-NEXT:    wlstp.8 lr, r6, .LBB0_3
+; CHECK-NEXT:    wlstp.8 lr, r3, .LBB0_3
 ; CHECK-NEXT:    b .LBB0_4
 ; CHECK-NEXT:  .LBB0_3: @ %for.body
 ; CHECK-NEXT:    @ in Loop: Header=BB0_2 Depth=1
@@ -71,8 +70,7 @@
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB1_4 Depth 2
 ; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    mov r3, r2
-; CHECK-NEXT:    wlstp.8 lr, r3, .LBB1_3
+; CHECK-NEXT:    wlstp.8 lr, r2, .LBB1_3
 ; CHECK-NEXT:    b .LBB1_4
 ; CHECK-NEXT:  .LBB1_3: @ %for.body
 ; CHECK-NEXT:    @ in Loop: Header=BB1_2 Depth=1
@@ -285,8 +283,7 @@
 ; CHECK-NEXT:  @ %bb.1: @ %prehead
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    mov r12, r0
-; CHECK-NEXT:    mov r3, r2
-; CHECK-NEXT:    wlstp.8 lr, r3, .LBB6_3
+; CHECK-NEXT: wlstp.8 lr, r2, .LBB6_3 ; CHECK-NEXT: .LBB6_2: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vstrb.8 q0, [r12], #16 ; CHECK-NEXT: letp lr, .LBB6_2 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wls-search-pred.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wls-search-pred.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wls-search-pred.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wls-search-pred.mir @@ -63,11 +63,11 @@ ; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = t2ADDri [[COPY]], 15, 14 /* CC::al */, $noreg, $noreg ; CHECK: [[t2BICri:%[0-9]+]]:rgpr = t2BICri killed [[t2ADDri]], 16, 14 /* CC::al */, $noreg, $noreg ; CHECK: [[t2LSRri:%[0-9]+]]:gprlr = t2LSRri killed [[t2BICri]], 4, 14 /* CC::al */, $noreg, $noreg - ; CHECK: [[t2WhileLoopStartLR:%[0-9]+]]:gprlr = t2WhileLoopStartLR killed [[t2LSRri]], %bb.3, implicit-def $cpsr + ; CHECK: [[t2WhileLoopStartTP:%[0-9]+]]:gprlr = t2WhileLoopStartTP killed [[t2LSRri]], [[COPY]], %bb.3, implicit-def $cpsr ; CHECK: bb.2: ; CHECK: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; CHECK: [[PHI:%[0-9]+]]:rgpr = PHI [[COPY2]], %bb.1, %11, %bb.2 - ; CHECK: [[PHI1:%[0-9]+]]:gprlr = PHI [[t2WhileLoopStartLR]], %bb.1, %13, %bb.2 + ; CHECK: [[PHI1:%[0-9]+]]:gprlr = PHI [[t2WhileLoopStartTP]], %bb.1, %13, %bb.2 ; CHECK: [[PHI2:%[0-9]+]]:rgpr = PHI [[COPY]], %bb.1, %15, %bb.2 ; CHECK: [[MVE_VCTP8_:%[0-9]+]]:vccr = MVE_VCTP8 [[PHI2]], 0, $noreg ; CHECK: [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[PHI2]], 16, 14 /* CC::al */, $noreg, $noreg diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll @@ -634,8 +634,7 @@ ; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1 ; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: add.w r3, r0, r5, lsl #1 -; CHECK-NEXT: mov r5, r6 -; CHECK-NEXT: wlstp.8 lr, r5, 
.LBB10_4 +; CHECK-NEXT: wlstp.8 lr, r6, .LBB10_4 ; CHECK-NEXT: b .LBB10_15 ; CHECK-NEXT: .LBB10_4: @ %for.cond1.for.cond.cleanup3_crit_edge.us ; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1 diff --git a/llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll b/llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll --- a/llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll +++ b/llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll @@ -235,8 +235,7 @@ ; CHECK-NEXT: .LBB10_1: @ %prehead ; CHECK-NEXT: mov r12, r1 ; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r3, r2 -; CHECK-NEXT: wlstp.8 lr, r3, .LBB10_3 +; CHECK-NEXT: wlstp.8 lr, r2, .LBB10_3 ; CHECK-NEXT: .LBB10_2: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrb.u8 q0, [r12], #16 ; CHECK-NEXT: vstrb.8 q0, [r4], #16 @@ -318,8 +317,7 @@ ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: mov r3, r0 -; CHECK-NEXT: mov r1, r2 -; CHECK-NEXT: wlstp.8 lr, r1, .LBB13_2 +; CHECK-NEXT: wlstp.8 lr, r2, .LBB13_2 ; CHECK-NEXT: .LBB13_1: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vstrb.8 q0, [r3], #16 ; CHECK-NEXT: letp lr, .LBB13_1 @@ -489,8 +487,7 @@ ; CHECK-NEXT: movt r3, :upper16:arr_56 ; CHECK-NEXT: lsr.w r12, r1, #4 ; CHECK-NEXT: mov r2, r3 -; CHECK-NEXT: mov r1, r0 -; CHECK-NEXT: wlstp.8 lr, r1, .LBB18_5 +; CHECK-NEXT: wlstp.8 lr, r0, .LBB18_5 ; CHECK-NEXT: .LBB18_4: @ Parent Loop BB18_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vstrb.8 q0, [r2], #16 @@ -498,8 +495,7 @@ ; CHECK-NEXT: .LBB18_5: @ %loop ; CHECK-NEXT: @ in Loop: Header=BB18_3 Depth=1 ; CHECK-NEXT: mov r2, r3 -; CHECK-NEXT: mov r1, r0 -; CHECK-NEXT: wlstp.8 lr, r1, .LBB18_7 +; CHECK-NEXT: wlstp.8 lr, r0, .LBB18_7 ; CHECK-NEXT: .LBB18_6: @ Parent Loop BB18_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vstrb.8 q0, [r2], #16 @@ -507,8 +503,7 @@ ; CHECK-NEXT: .LBB18_7: @ %loop ; CHECK-NEXT: @ in Loop: Header=BB18_3 Depth=1 ; CHECK-NEXT: mov r2, r3 -; CHECK-NEXT: mov r1, r0 -; CHECK-NEXT: wlstp.8 lr, r1, .LBB18_9 +; 
CHECK-NEXT: wlstp.8 lr, r0, .LBB18_9 ; CHECK-NEXT: .LBB18_8: @ Parent Loop BB18_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vstrb.8 q0, [r2], #16 @@ -567,12 +562,10 @@ ; CHECK-NEXT: movw r0, :lower16:arr_22 ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: movt r0, :upper16:arr_22 -; CHECK-NEXT: str r2, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: add.w r1, r2, #15 ; CHECK-NEXT: lsrs r3, r1, #4 -; CHECK-NEXT: mov r1, r2 -; CHECK-NEXT: str r3, [sp] @ 4-byte Spill -; CHECK-NEXT: wlstp.8 lr, r1, .LBB19_2 +; CHECK-NEXT: strd r3, r2, [sp] @ 8-byte Folded Spill +; CHECK-NEXT: wlstp.8 lr, r2, .LBB19_2 ; CHECK-NEXT: .LBB19_1: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vstrb.8 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB19_1 @@ -621,11 +614,12 @@ ; CHECK-NEXT: le lr, .LBB19_3 ; CHECK-NEXT: @ %bb.4: @ %for.cond.cleanup6 ; CHECK-NEXT: movw r0, :lower16:arr_22 -; CHECK-NEXT: ldrd r2, r1, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: movt r0, :upper16:arr_22 -; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: ldr r3, [sp] @ 4-byte Reload ; CHECK-NEXT: add.w r0, r0, #1824 -; CHECK-NEXT: wlstp.8 lr, r1, .LBB19_6 +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: wlstp.8 lr, r2, .LBB19_6 ; CHECK-NEXT: .LBB19_5: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vstrb.8 q1, [r0], #16 ; CHECK-NEXT: letp lr, .LBB19_5 @@ -675,11 +669,12 @@ ; CHECK-NEXT: le lr, .LBB19_7 ; CHECK-NEXT: @ %bb.8: @ %for.cond.cleanup6.1 ; CHECK-NEXT: movw r0, :lower16:arr_22 -; CHECK-NEXT: ldrd r2, r1, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: movt r0, :upper16:arr_22 -; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: ldr r3, [sp] @ 4-byte Reload ; CHECK-NEXT: add.w r0, r0, #3648 -; CHECK-NEXT: wlstp.8 lr, r1, .LBB19_10 +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: wlstp.8 lr, r2, .LBB19_10 ; CHECK-NEXT: .LBB19_9: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vstrb.8 q1, [r0], #16 ; CHECK-NEXT: 
letp lr, .LBB19_9 @@ -731,19 +726,14 @@ ; CHECK-NEXT: le lr, .LBB19_11 ; CHECK-NEXT: @ %bb.12: @ %for.cond.cleanup6.2 ; CHECK-NEXT: movw r0, :lower16:arr_22 -; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload +; CHECK-NEXT: ldrd r2, r1, [sp] @ 8-byte Folded Reload ; CHECK-NEXT: movt r0, :upper16:arr_22 ; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: add.w r0, r0, #5472 -; CHECK-NEXT: wls lr, r1, .LBB19_14 +; CHECK-NEXT: wlstp.8 lr, r1, .LBB19_14 ; CHECK-NEXT: .LBB19_13: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: vctp.8 r1 -; CHECK-NEXT: subs r1, #16 -; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrbt.8 q1, [r0], #16 -; CHECK-NEXT: le lr, .LBB19_13 +; CHECK-NEXT: vstrb.8 q1, [r0], #16 +; CHECK-NEXT: letp lr, .LBB19_13 ; CHECK-NEXT: .LBB19_14: @ %for.cond.cleanup6.2 ; CHECK-NEXT: movw r2, :lower16:arr_21 ; CHECK-NEXT: movw r1, #5508