Index: llvm/include/llvm/CodeGen/TargetInstrInfo.h =================================================================== --- llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -348,6 +348,12 @@ unsigned &Size, unsigned &Offset, const MachineFunction &MF) const; + /// Return true if the given instruction is a terminator that is unspillable, + /// according to isUnspillableTerminatorImpl. + bool isUnspillableTerminator(const MachineInstr *MI) const { + return MI->isTerminator() && isUnspillableTerminatorImpl(MI); + } + /// Returns the size in bytes of the specified MachineInstr, or ~0U /// when this function is not implemented by a target. virtual unsigned getInstSizeInBytes(const MachineInstr &MI) const { @@ -954,6 +960,17 @@ return None; } + /// Return true if the given terminator MI is not expected to spill. This + /// sets the live interval as not spillable and adjusts phi node lowering to + /// not introduce copies after the terminator. Use with care, these are + /// currently used for hardware loop intrinsics in very controlled situations, + /// created prior to register allocation in loops that only have single phi + /// users for the terminator's value. They may run out of registers if not + /// used carefully.
+ virtual bool isUnspillableTerminatorImpl(const MachineInstr *MI) const { + return false; + } + public: /// If the specific machine instruction is a instruction that moves/copies /// value from one register to another register return destination and source Index: llvm/lib/CodeGen/CalcSpillWeights.cpp =================================================================== --- llvm/lib/CodeGen/CalcSpillWeights.cpp +++ llvm/lib/CodeGen/CalcSpillWeights.cpp @@ -142,6 +142,7 @@ SlotIndex *End) { MachineRegisterInfo &MRI = MF.getRegInfo(); const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); MachineBasicBlock *MBB = nullptr; MachineLoop *Loop = nullptr; bool IsExiting = false; @@ -221,6 +222,13 @@ if (!Visited.insert(MI).second) continue; + // For terminators that produce values, ask the backend if the register is + // not spillable. + if (TII.isUnspillableTerminator(MI) && MI->definesRegister(LI.reg())) { + LI.markNotSpillable(); + return -1.0f; + } + float Weight = 1.0f; if (IsSpillable) { // Get loop info for mi. Index: llvm/lib/CodeGen/MachineVerifier.cpp =================================================================== --- llvm/lib/CodeGen/MachineVerifier.cpp +++ llvm/lib/CodeGen/MachineVerifier.cpp @@ -1550,6 +1550,20 @@ if (MI->isInlineAsm()) verifyInlineAsm(MI); + // Check that unspillable terminators define a reg and have at most one use. + if (TII->isUnspillableTerminator(MI)) { + if (!MI->getOperand(0).isReg() || !MI->getOperand(0).isDef()) { + report("Unspillable Terminator does not define a reg", MI); + } else { + // Only query the register once operand 0 is known to be a register def; + // getReg() asserts on non-register operands. + Register Def = MI->getOperand(0).getReg(); + if (Def.isVirtual() && + std::distance(MRI->use_nodbg_begin(Def), MRI->use_nodbg_end()) > 1) + report("Unspillable Terminator expected to have at most one use!", MI); + } + } + // A fully-formed DBG_VALUE must have a location. Ignore partially formed // DBG_VALUEs: these are convenient to use in tests, but should never get // generated.
Index: llvm/lib/CodeGen/PHIElimination.cpp =================================================================== --- llvm/lib/CodeGen/PHIElimination.cpp +++ llvm/lib/CodeGen/PHIElimination.cpp @@ -442,6 +442,19 @@ if (!MBBsInsertedInto.insert(&opBlock).second) continue; // If the copy has already been emitted, we're done. + MachineInstr *SrcRegDef = MRI->getVRegDef(SrcReg); + if (SrcRegDef && TII->isUnspillableTerminator(SrcRegDef)) { + assert(SrcRegDef->getOperand(0).isReg() && + SrcRegDef->getOperand(0).isDef() && + "Expected operand 0 to be a reg def!"); + // The PHI (the only expected user) is already gone, so SrcReg must be + // unused. Define IncomingReg from the terminator instead of copying. + assert(MRI->use_empty(SrcReg) && + "Expected a single use from UnspillableTerminator"); + SrcRegDef->getOperand(0).setReg(IncomingReg); + continue; + } + // Find a safe location to insert the copy, this may be the first terminator // in the block (or end()). MachineBasicBlock::iterator InsertPos = Index: llvm/lib/Target/ARM/ARMBaseInstrInfo.h =================================================================== --- llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -360,6 +360,10 @@ /// Enable outlining by default at -Oz. bool shouldOutlineFromFunctionByDefault(MachineFunction &MF) const override; + bool isUnspillableTerminatorImpl(const MachineInstr *MI) const override { + return MI->getOpcode() == ARM::t2LoopEndDec; + } + private: /// Returns an unused general-purpose register which can be used for /// constructing an outlined call if one exists. Returns 0 otherwise. Index: llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp =================================================================== --- llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -5951,7 +5951,8 @@ // Be conservative with ARMv8.1 MVE instructions.
if (Opc == ARM::t2BF_LabelPseudo || Opc == ARM::t2DoLoopStart || Opc == ARM::t2DoLoopStartTP || Opc == ARM::t2WhileLoopStart || - Opc == ARM::t2LoopDec || Opc == ARM::t2LoopEnd) + Opc == ARM::t2LoopDec || Opc == ARM::t2LoopEnd || + Opc == ARM::t2LoopEndDec) return outliner::InstrType::Illegal; const MCInstrDesc &MCID = MI.getDesc(); Index: llvm/lib/Target/ARM/ARMInstrThumb2.td =================================================================== --- llvm/lib/Target/ARM/ARMInstrThumb2.td +++ llvm/lib/Target/ARM/ARMInstrThumb2.td @@ -5448,6 +5448,10 @@ t2PseudoInst<(outs), (ins GPRlr:$elts, brtarget:$target), 8, IIC_Br, []>, Sched<[WriteBr]>; +def t2LoopEndDec : + t2PseudoInst<(outs GPRlr:$Rm), (ins GPRlr:$elts, brtarget:$target), + 8, IIC_Br, []>, Sched<[WriteBr]>; + } // end isBranch, isTerminator, hasSideEffects } Index: llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp =================================================================== --- llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -507,6 +507,8 @@ void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const; + void RevertLoopEndDec(MachineInstr *MI) const; + void ConvertVPTBlocks(LowOverheadLoop &LoLoop); MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop); @@ -1023,12 +1025,12 @@ // can only jump back. auto ValidateRanges = [](MachineInstr *Start, MachineInstr *End, ARMBasicBlockUtils *BBUtils, MachineLoop &ML) { - assert(End->getOperand(1).isMBB() && - "Expected LoopEnd to target basic block!"); - + MachineBasicBlock *TgtBB = End->getOpcode() == ARM::t2LoopEnd + ? End->getOperand(1).getMBB() + : End->getOperand(2).getMBB(); // TODO Maybe there's cases where the target doesn't have to be the header, // but for now be safe and revert.
- if (End->getOperand(1).getMBB() != ML.getHeader()) { + if (TgtBB != ML.getHeader()) { LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targeting header.\n"); return false; } @@ -1270,6 +1272,8 @@ LoLoop.Dec = &MI; else if (MI.getOpcode() == ARM::t2LoopEnd) LoLoop.End = &MI; + else if (MI.getOpcode() == ARM::t2LoopEndDec) + LoLoop.End = LoLoop.Dec = &MI; else if (isLoopStart(MI)) LoLoop.Start = &MI; else if (MI.getDesc().isCall()) { @@ -1292,13 +1296,16 @@ return false; } - // Check that the only instruction using LoopDec is LoopEnd. + // Check that the only instruction using LoopDec is LoopEnd. This can only + // happen when the Dec and End are separate, not a single t2LoopEndDec. // TODO: Check for copy chains that really have no effect. - SmallPtrSet<MachineInstr*, 2> Uses; - RDA->getReachingLocalUses(LoLoop.Dec, MCRegister::from(ARM::LR), Uses); - if (Uses.size() > 1 || !Uses.count(LoLoop.End)) { - LLVM_DEBUG(dbgs() << "ARM Loops: Unable to remove LoopDec.\n"); - LoLoop.Revert = true; + if (LoLoop.Dec != LoLoop.End) { + SmallPtrSet<MachineInstr *, 2> Uses; + RDA->getReachingLocalUses(LoLoop.Dec, MCRegister::from(ARM::LR), Uses); + if (Uses.size() > 1 || !Uses.count(LoLoop.End)) { + LLVM_DEBUG(dbgs() << "ARM Loops: Unable to remove LoopDec.\n"); + LoLoop.Revert = true; + } } LoLoop.Validate(BBUtils.get()); Expand(LoLoop); @@ -1353,6 +1360,35 @@ llvm::RevertLoopEnd(MI, TII, BrOpc, SkipCmp); } +// Generate a subs and a conditional branch instead of an LE.
+void ARMLowOverheadLoops::RevertLoopEndDec(MachineInstr *MI) const { + LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to subs, br: " << *MI); + assert(MI->getOpcode() == ARM::t2LoopEndDec && "Expected a t2LoopEndDec!"); + MachineBasicBlock *MBB = MI->getParent(); + + MachineInstrBuilder MIB = + BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2SUBri)); + MIB.addDef(ARM::LR); + MIB.add(MI->getOperand(1)); + MIB.addImm(1); + MIB.addImm(ARMCC::AL); + MIB.addReg(ARM::NoRegister); + MIB.addReg(ARM::CPSR); + MIB->getOperand(5).setIsDef(true); + + MachineBasicBlock *DestBB = MI->getOperand(2).getMBB(); + unsigned BrOpc = + BBUtils->isBBInRange(MI, DestBB, 254) ? ARM::tBcc : ARM::t2Bcc; + + // Create bne + MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc)); + MIB.add(MI->getOperand(2)); // branch target + MIB.addImm(ARMCC::NE); // condition code + MIB.addReg(ARM::CPSR); + + MI->eraseFromParent(); +} + // Perform dead code elimation on the loop iteration count setup expression. // If we are tail-predicating, the number of elements to be processed is the // operand of the VCTP instruction in the vector body, see getCount(), which is @@ -1558,8 +1594,9 @@ MachineInstrBuilder MIB = BuildMI(*MBB, End, End->getDebugLoc(), TII->get(Opc)); MIB.addDef(ARM::LR); - MIB.add(End->getOperand(0)); - MIB.add(End->getOperand(1)); + unsigned Off = LoLoop.Dec == LoLoop.End ?
1 : 0; + MIB.add(End->getOperand(Off + 0)); + MIB.add(End->getOperand(Off + 1)); LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB); LoLoop.ToRemove.insert(LoLoop.Dec); LoLoop.ToRemove.insert(End); @@ -1588,8 +1625,10 @@ RevertWhile(LoLoop.Start); else RevertDo(LoLoop.Start); - bool FlagsAlreadySet = RevertLoopDec(LoLoop.Dec); - RevertLoopEnd(LoLoop.End, FlagsAlreadySet); + if (LoLoop.Dec == LoLoop.End) + RevertLoopEndDec(LoLoop.End); + else + RevertLoopEnd(LoLoop.End, RevertLoopDec(LoLoop.Dec)); } else { LoLoop.Start = ExpandLoopStart(LoLoop); RemoveDeadBranch(LoLoop.Start); @@ -1633,6 +1672,7 @@ SmallVector<MachineInstr*, 4> Starts; SmallVector<MachineInstr*, 4> Decs; SmallVector<MachineInstr*, 4> Ends; + SmallVector<MachineInstr *, 4> EndDecs; for (auto &I : MBB) { if (isLoopStart(I)) @@ -1641,9 +1681,11 @@ Decs.push_back(&I); else if (I.getOpcode() == ARM::t2LoopEnd) Ends.push_back(&I); + else if (I.getOpcode() == ARM::t2LoopEndDec) + EndDecs.push_back(&I); } - if (Starts.empty() && Decs.empty() && Ends.empty()) + if (Starts.empty() && Decs.empty() && Ends.empty() && EndDecs.empty()) continue; Changed = true; @@ -1659,6 +1701,8 @@ for (auto *End : Ends) RevertLoopEnd(End); + for (auto *End : EndDecs) + RevertLoopEndDec(End); } return Changed; } Index: llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp =================================================================== --- llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp +++ llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp @@ -35,6 +35,11 @@ #define DEBUG_TYPE "arm-mve-vpt-opts" +static cl::opt<bool> +MergeEndDec("arm-enable-merge-loopenddec", cl::Hidden, + cl::desc("Enable merging Loop End and Dec instructions."), + cl::init(true)); + namespace { class MVETPAndVPTOptimisations : public MachineFunctionPass { public: @@ -110,6 +115,11 @@ LoopEnd = &T; break; } + if (T.getOpcode() == ARM::t2LoopEndDec && + T.getOperand(2).getMBB() == Header) { + LoopEnd = &T; + break; + } } if (!LoopEnd) { LLVM_DEBUG(dbgs() << " no LoopEnd\n"); @@ -126,11 +136,15 @@ // $vd = t2LoopDec $vp //
... // t2LoopEnd $vd, loop - LoopDec = - LookThroughCOPY(MRI->getVRegDef(LoopEnd->getOperand(0).getReg()), MRI); - if (!LoopDec || LoopDec->getOpcode() != ARM::t2LoopDec) { - LLVM_DEBUG(dbgs() << " didn't find LoopDec where we expected!\n"); - return false; + if (LoopEnd->getOpcode() == ARM::t2LoopEndDec) + LoopDec = LoopEnd; + else { + LoopDec = + LookThroughCOPY(MRI->getVRegDef(LoopEnd->getOperand(0).getReg()), MRI); + if (!LoopDec || LoopDec->getOpcode() != ARM::t2LoopDec) { + LLVM_DEBUG(dbgs() << " didn't find LoopDec where we expected!\n"); + return false; + } } LLVM_DEBUG(dbgs() << " found loop dec: " << *LoopDec); @@ -166,6 +180,9 @@ // decrement) around the loop edge, which means we need to be careful that they // will be valid to allocate without any spilling. bool MVETPAndVPTOptimisations::MergeLoopEnd(MachineLoop *ML) { + if (!MergeEndDec) + return false; + LLVM_DEBUG(dbgs() << "MergeLoopEnd on loop " << ML->getHeader()->getName() << "\n"); @@ -233,9 +250,16 @@ LoopPhi->getOperand(3).setReg(DecReg); } - LoopDec->getOperand(1).setReg(PhiReg); - LoopEnd->getOperand(0).setReg(DecReg); + // Replace the loop dec and loop end with a single instruction.
+ MachineInstrBuilder MI = + BuildMI(*LoopEnd->getParent(), *LoopEnd, LoopEnd->getDebugLoc(), + TII->get(ARM::t2LoopEndDec), DecReg) + .addReg(PhiReg) + .add(LoopEnd->getOperand(1)); + LLVM_DEBUG(dbgs() << "Merged LoopDec and End into: " << *MI.getInstr()); + + LoopDec->eraseFromParent(); + LoopEnd->eraseFromParent(); for (auto *MI : Copies) MI->eraseFromParent(); return true; @@ -255,6 +279,8 @@ MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec; if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd)) return false; + if (LoopDec != LoopEnd) + return false; SmallVector<MachineInstr *, 4> VCTPs; for (MachineBasicBlock *BB : ML->blocks()) Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/count_dominates_start.mir =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/count_dominates_start.mir +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/count_dominates_start.mir @@ -143,8 +143,7 @@ ; CHECK: [[COPY8:%[0-9]+]]:gpr = COPY [[MVE_VMLADAVas16_]] ; CHECK: [[COPY9:%[0-9]+]]:gpr = COPY [[MVE_VLDRHU16_post2]] ; CHECK: [[COPY10:%[0-9]+]]:gpr = COPY [[MVE_VLDRHU16_post]] - ; CHECK: [[t2LoopDec:%[0-9]+]]:gprlr = t2LoopDec [[PHI3]], 1 - ; CHECK: t2LoopEnd [[t2LoopDec]], %bb.3, implicit-def dead $cpsr + ; CHECK: [[t2LoopEndDec:%[0-9]+]]:gprlr = t2LoopEndDec [[PHI3]], %bb.3, implicit-def $cpsr ; CHECK: t2B %bb.4, 14 /* CC::al */, $noreg ; CHECK: bb.4.for.cond.cleanup: ; CHECK: [[PHI5:%[0-9]+]]:gpr = PHI [[COPY3]], %bb.1, [[COPY8]], %bb.3 Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll @@ -6,88 +6,78 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: ldr.w r12, [r0] ; CHECK-NEXT:
subs.w r9, r1, #1 ; CHECK-NEXT: beq .LBB0_3 ; CHECK-NEXT: @ %bb.1: @ %while.body.preheader -; CHECK-NEXT: subs r6, r1, #2 -; CHECK-NEXT: and r7, r9, #3 -; CHECK-NEXT: cmp r6, #3 -; CHECK-NEXT: str r7, [sp] @ 4-byte Spill +; CHECK-NEXT: subs r7, r1, #2 +; CHECK-NEXT: and r8, r9, #3 +; CHECK-NEXT: cmp r7, #3 ; CHECK-NEXT: bhs .LBB0_4 ; CHECK-NEXT: @ %bb.2: -; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: movs r6, #0 ; CHECK-NEXT: b .LBB0_6 ; CHECK-NEXT: .LBB0_3: -; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: movs r6, #0 ; CHECK-NEXT: b .LBB0_10 ; CHECK-NEXT: .LBB0_4: @ %while.body.preheader.new -; CHECK-NEXT: bic r6, r9, #3 -; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: subs r6, #4 -; CHECK-NEXT: mov.w r8, #0 -; CHECK-NEXT: add.w lr, r4, r6, lsr #2 -; CHECK-NEXT: movs r6, #4 -; CHECK-NEXT: mov lr, lr -; CHECK-NEXT: mov r11, lr +; CHECK-NEXT: bic r7, r9, #3 +; CHECK-NEXT: movs r6, #1 +; CHECK-NEXT: subs r7, #4 +; CHECK-NEXT: add.w lr, r6, r7, lsr #2 +; CHECK-NEXT: movs r6, #0 +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: movs r7, #4 ; CHECK-NEXT: .LBB0_5: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r10, [r0, #16]! 
-; CHECK-NEXT: mov lr, r11 -; CHECK-NEXT: sub.w lr, lr, #1 ; CHECK-NEXT: sub.w r9, r9, #4 -; CHECK-NEXT: ldrd r7, r5, [r0, #-12] -; CHECK-NEXT: mov r11, lr -; CHECK-NEXT: ldr r4, [r0, #-4] -; CHECK-NEXT: cmp r12, r7 +; CHECK-NEXT: ldrd r5, r4, [r0, #-12] +; CHECK-NEXT: ldr r11, [r0, #-4] +; CHECK-NEXT: cmp r12, r5 ; CHECK-NEXT: it gt -; CHECK-NEXT: subgt.w r8, r6, #3 -; CHECK-NEXT: csel r7, r7, r12, gt -; CHECK-NEXT: cmp r7, r5 +; CHECK-NEXT: subgt r6, r7, #3 +; CHECK-NEXT: csel r5, r5, r12, gt +; CHECK-NEXT: cmp r5, r4 ; CHECK-NEXT: it gt -; CHECK-NEXT: subgt.w r8, r6, #2 -; CHECK-NEXT: csel r7, r5, r7, gt -; CHECK-NEXT: cmp r7, r4 +; CHECK-NEXT: subgt r6, r7, #2 +; CHECK-NEXT: csel r5, r4, r5, gt +; CHECK-NEXT: cmp r5, r11 ; CHECK-NEXT: it gt -; CHECK-NEXT: subgt.w r8, r6, #1 -; CHECK-NEXT: csel r7, r4, r7, gt -; CHECK-NEXT: cmp r7, r10 -; CHECK-NEXT: csel r8, r6, r8, gt -; CHECK-NEXT: add.w r6, r6, #4 -; CHECK-NEXT: csel r12, r10, r7, gt -; CHECK-NEXT: cmp.w lr, #0 -; CHECK-NEXT: bne .LBB0_5 -; CHECK-NEXT: b .LBB0_6 +; CHECK-NEXT: subgt r6, r7, #1 +; CHECK-NEXT: csel r5, r11, r5, gt +; CHECK-NEXT: cmp r5, r10 +; CHECK-NEXT: csel r6, r7, r6, gt +; CHECK-NEXT: add.w r7, r7, #4 +; CHECK-NEXT: csel r12, r10, r5, gt +; CHECK-NEXT: le lr, .LBB0_5 ; CHECK-NEXT: .LBB0_6: @ %while.end.loopexit.unr-lcssa -; CHECK-NEXT: ldr r7, [sp] @ 4-byte Reload -; CHECK-NEXT: cbz r7, .LBB0_10 +; CHECK-NEXT: cmp.w r8, #0 +; CHECK-NEXT: beq .LBB0_10 ; CHECK-NEXT: @ %bb.7: @ %while.body.epil -; CHECK-NEXT: ldr r4, [r0, #4] +; CHECK-NEXT: ldr r7, [r0, #4] ; CHECK-NEXT: sub.w r1, r1, r9 -; CHECK-NEXT: cmp r12, r4 -; CHECK-NEXT: csel r8, r1, r8, gt -; CHECK-NEXT: csel r12, r4, r12, gt -; CHECK-NEXT: cmp r7, #1 +; CHECK-NEXT: cmp r12, r7 +; CHECK-NEXT: csel r6, r1, r6, gt +; CHECK-NEXT: csel r12, r7, r12, gt +; CHECK-NEXT: cmp.w r8, #1 ; CHECK-NEXT: beq .LBB0_10 ; CHECK-NEXT: @ %bb.8: @ %while.body.epil.1 -; CHECK-NEXT: ldr r4, [r0, #8] -; CHECK-NEXT: cmp r12, r4 -; CHECK-NEXT: csinc r8, r8, 
r1, le -; CHECK-NEXT: csel r12, r4, r12, gt -; CHECK-NEXT: cmp r7, #2 +; CHECK-NEXT: ldr r7, [r0, #8] +; CHECK-NEXT: cmp r12, r7 +; CHECK-NEXT: csinc r6, r6, r1, le +; CHECK-NEXT: csel r12, r7, r12, gt +; CHECK-NEXT: cmp.w r8, #2 ; CHECK-NEXT: beq .LBB0_10 ; CHECK-NEXT: @ %bb.9: @ %while.body.epil.2 ; CHECK-NEXT: ldr r0, [r0, #12] ; CHECK-NEXT: cmp r12, r0 ; CHECK-NEXT: it gt -; CHECK-NEXT: addgt.w r8, r1, #2 +; CHECK-NEXT: addgt r6, r1, #2 ; CHECK-NEXT: csel r12, r0, r12, gt ; CHECK-NEXT: .LBB0_10: @ %while.end ; CHECK-NEXT: str.w r12, [r2] -; CHECK-NEXT: str.w r8, [r3] -; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: str r6, [r3] ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: %0 = load i32, i32* %pSrc, align 4 Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll @@ -876,8 +876,7 @@ ; CHECK-LABEL: float_int_int_mul: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: beq .LBB4_8 +; CHECK-NEXT: cbz r3, .LBB4_8 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: bhi .LBB4_3 Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/regalloc.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/regalloc.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/regalloc.ll @@ -47,14 +47,14 @@ ; CHECK-NEXT: addlt r4, r6, #2 ; CHECK-NEXT: sxtb.w r10, r12 ; CHECK-NEXT: cmp r10, r5 -; CHECK-NEXT: csel r10, r5, r12, lt +; CHECK-NEXT: csel r12, r5, r12, lt ; CHECK-NEXT: ldrsb r5, [r7, #-2] ; CHECK-NEXT: it lt ; CHECK-NEXT: addlt r4, r6, #3 ; CHECK-NEXT: adds r6, #4 -; CHECK-NEXT: sxtb.w r12, r10 -; CHECK-NEXT: cmp r12, r5 -; CHECK-NEXT: csel r12, r5, r10, lt +; CHECK-NEXT: sxtb.w r10, r12 +; CHECK-NEXT: cmp r10, r5 +; 
CHECK-NEXT: csel r12, r5, r12, lt ; CHECK-NEXT: csel r10, r6, r4, lt ; CHECK-NEXT: le lr, .LBB0_5 ; CHECK-NEXT: @ %bb.6: @ %while.end.loopexit.unr-lcssa.loopexit @@ -213,10 +213,10 @@ ; CHECK-NEXT: bic r4, r2, #7 ; CHECK-NEXT: movs r5, #1 ; CHECK-NEXT: sub.w r3, r4, #8 -; CHECK-NEXT: add.w r12, r1, r4, lsl #1 +; CHECK-NEXT: add.w r12, r0, r4, lsl #1 ; CHECK-NEXT: add.w lr, r5, r3, lsr #3 ; CHECK-NEXT: and r5, r2, #7 -; CHECK-NEXT: add.w r3, r0, r4, lsl #1 +; CHECK-NEXT: add.w r3, r1, r4, lsl #1 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB1_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -232,19 +232,19 @@ ; CHECK-NEXT: dls lr, r5 ; CHECK-NEXT: .LBB1_7: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldr.16 s0, [r3] -; CHECK-NEXT: add.w r0, r12, #2 -; CHECK-NEXT: adds r3, #2 +; CHECK-NEXT: vldr.16 s0, [r12] +; CHECK-NEXT: adds r0, r3, #2 +; CHECK-NEXT: add.w r12, r12, #2 ; CHECK-NEXT: vabs.f16 s0, s0 -; CHECK-NEXT: vstr.16 s0, [r12] -; CHECK-NEXT: mov r12, r0 +; CHECK-NEXT: vstr.16 s0, [r3] +; CHECK-NEXT: mov r3, r0 ; CHECK-NEXT: le lr, .LBB1_7 ; CHECK-NEXT: .LBB1_8: @ %while.end ; CHECK-NEXT: pop {r4, r5, r7, pc} ; CHECK-NEXT: .LBB1_9: ; CHECK-NEXT: mov r5, r2 -; CHECK-NEXT: mov r3, r0 -; CHECK-NEXT: mov r12, r1 +; CHECK-NEXT: mov r12, r0 +; CHECK-NEXT: mov r3, r1 ; CHECK-NEXT: b .LBB1_6 entry: %cmp4 = icmp eq i32 %blockSize, 0 Index: llvm/test/CodeGen/Thumb2/mve-float32regloops.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-float32regloops.ll +++ llvm/test/CodeGen/Thumb2/mve-float32regloops.ll @@ -1080,7 +1080,7 @@ ; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: add.w r0, r5, r0, lsl #2 ; CHECK-NEXT: add.w r5, r0, #16 -; CHECK-NEXT: beq.w .LBB16_12 +; CHECK-NEXT: beq .LBB16_12 ; CHECK-NEXT: .LBB16_4: @ %while.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB16_6 Depth 2 Index: llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll 
=================================================================== --- llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll +++ llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll @@ -1421,12 +1421,11 @@ ; CHECK-NEXT: vldrwt.u32 q0, [r12], #16 ; CHECK-NEXT: vldrwt.u32 q1, [r3], #16 ; CHECK-NEXT: vfmat.f32 q6, q1, q0 -; CHECK-NEXT: add.w r6, r11, r5 +; CHECK-NEXT: vstrw.32 q6, [sp, #48] @ 16-byte Spill ; CHECK-NEXT: vpstt ; CHECK-NEXT: vldrwt.u32 q1, [r11] ; CHECK-NEXT: vfmat.f32 q7, q1, q0 -; CHECK-NEXT: vstrw.32 q7, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vmov q7, q6 +; CHECK-NEXT: add.w r6, r11, r5 ; CHECK-NEXT: vmov q6, q5 ; CHECK-NEXT: vmov q5, q3 ; CHECK-NEXT: vmov q3, q4 @@ -1434,23 +1433,22 @@ ; CHECK-NEXT: vldrwt.u32 q1, [r6] ; CHECK-NEXT: vmov q4, q2 ; CHECK-NEXT: vldrw.u32 q2, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: adds r7, r6, r5 ; CHECK-NEXT: vpst ; CHECK-NEXT: vfmat.f32 q2, q1, q0 ; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: adds r7, r6, r5 ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrwt.u32 q1, [r7] ; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: adds r6, r7, r5 ; CHECK-NEXT: vpst ; CHECK-NEXT: vfmat.f32 q2, q1, q0 -; CHECK-NEXT: adds r6, r7, r5 ; CHECK-NEXT: vstrw.32 q2, [sp, #80] @ 16-byte Spill ; CHECK-NEXT: vmov q2, q4 ; CHECK-NEXT: vmov q4, q3 ; CHECK-NEXT: vmov q3, q5 ; CHECK-NEXT: vmov q5, q6 -; CHECK-NEXT: vmov q6, q7 -; CHECK-NEXT: vldrw.u32 q7, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q6, [sp, #48] @ 16-byte Reload ; CHECK-NEXT: adds r7, r6, r5 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vldrwt.u32 q1, [r6] Index: llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll +++ llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll @@ -1068,7 +1068,7 @@ ; CHECK-NEXT: vstr s0, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: mov r1, r2 ; CHECK-NEXT: str r2, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: blo.w .LBB7_9 +; CHECK-NEXT: blo .LBB7_9 ; 
CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: ldr r2, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: movs r3, #1 Index: llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll +++ llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll @@ -20,7 +20,7 @@ ; CHECK-NEXT: @ %bb.2: ; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: mov r9, r1 +; CHECK-NEXT: mov r6, r1 ; CHECK-NEXT: mov r11, r2 ; CHECK-NEXT: b .LBB0_6 ; CHECK-NEXT: .LBB0_3: @ %vector.ph @@ -32,38 +32,38 @@ ; CHECK-NEXT: str r3, [sp] @ 4-byte Spill ; CHECK-NEXT: add.w lr, r6, r7, lsr #1 ; CHECK-NEXT: add.w r11, r2, r3, lsl #2 -; CHECK-NEXT: add.w r9, r1, r3, lsl #2 +; CHECK-NEXT: add.w r6, r1, r3, lsl #2 ; CHECK-NEXT: add.w r12, r0, r3, lsl #2 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vldrw.u32 q0, [r4] +; CHECK-NEXT: mvn r10, #-2147483648 ; CHECK-NEXT: vmvn.i32 q1, #0x80000000 -; CHECK-NEXT: mov.w r10, #-1 ; CHECK-NEXT: .LBB0_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldrd r4, r5, [r0] +; CHECK-NEXT: ldrd r4, r8, [r0] ; CHECK-NEXT: adds r0, #8 -; CHECK-NEXT: ldrd r7, r6, [r1] +; CHECK-NEXT: ldrd r7, r5, [r1] ; CHECK-NEXT: adds r1, #8 -; CHECK-NEXT: smull r8, r5, r6, r5 +; CHECK-NEXT: smull r8, r5, r5, r8 ; CHECK-NEXT: smull r4, r7, r7, r4 ; CHECK-NEXT: asrl r8, r5, #31 ; CHECK-NEXT: asrl r4, r7, #31 -; CHECK-NEXT: rsbs.w r3, r4, #-2147483648 +; CHECK-NEXT: rsbs.w r9, r4, #-2147483648 ; CHECK-NEXT: vmov.32 q4[0], r4 -; CHECK-NEXT: sbcs.w r3, r10, r7 -; CHECK-NEXT: vmov.32 q4[1], r7 +; CHECK-NEXT: mov.w r9, #-1 +; CHECK-NEXT: sbcs.w r3, r9, r7 ; CHECK-NEXT: mov.w r3, #0 -; CHECK-NEXT: vmov.32 q4[2], r8 +; CHECK-NEXT: vmov.32 q4[1], r7 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r3, #1 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: vmov.32 q4[3], r5 +; CHECK-NEXT: vmov.32 q4[2], r8 ; CHECK-NEXT: vmov.32 q2[0], r3 +; CHECK-NEXT: vmov.32 q4[3], r5 ; CHECK-NEXT: 
vmov.32 q2[1], r3 ; CHECK-NEXT: rsbs.w r3, r8, #-2147483648 -; CHECK-NEXT: sbcs.w r3, r10, r5 -; CHECK-NEXT: mvn r5, #-2147483648 +; CHECK-NEXT: sbcs.w r3, r9, r5 ; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r3, #1 @@ -76,7 +76,7 @@ ; CHECK-NEXT: vorr q2, q2, q3 ; CHECK-NEXT: vmov r4, s8 ; CHECK-NEXT: vmov r3, s9 -; CHECK-NEXT: subs r4, r4, r5 +; CHECK-NEXT: subs.w r4, r4, r10 ; CHECK-NEXT: sbcs r3, r3, #0 ; CHECK-NEXT: vmov r4, s10 ; CHECK-NEXT: mov.w r3, #0 @@ -87,7 +87,7 @@ ; CHECK-NEXT: vmov.32 q3[0], r3 ; CHECK-NEXT: vmov.32 q3[1], r3 ; CHECK-NEXT: vmov r3, s11 -; CHECK-NEXT: subs r4, r4, r5 +; CHECK-NEXT: subs.w r4, r4, r10 ; CHECK-NEXT: sbcs r3, r3, #0 ; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: it lt @@ -116,7 +116,7 @@ ; CHECK-NEXT: .LBB0_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r3, [r12], #4 -; CHECK-NEXT: ldr r4, [r9], #4 +; CHECK-NEXT: ldr r4, [r6], #4 ; CHECK-NEXT: smull r4, r3, r4, r3 ; CHECK-NEXT: asrl r4, r3, #31 ; CHECK-NEXT: subs r5, r1, r4 @@ -234,37 +234,35 @@ ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq.w .LBB1_8 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: mov r8, r2 -; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: mov r9, r1 +; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: bhi .LBB1_3 ; CHECK-NEXT: @ %bb.2: ; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: mov r10, r1 -; CHECK-NEXT: mov r11, r8 +; CHECK-NEXT: mov r1, r9 +; CHECK-NEXT: mov r11, r2 ; CHECK-NEXT: b .LBB1_6 ; CHECK-NEXT: .LBB1_3: @ %vector.ph +; CHECK-NEXT: bic r7, r3, #3 +; CHECK-NEXT: adr r4, .LCPI1_0 +; CHECK-NEXT: subs r1, r7, #4 ; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: subs r2, r3, #4 -; CHECK-NEXT: movs r7, #1 -; CHECK-NEXT: str r3, [sp] @ 4-byte Spill -; CHECK-NEXT: add.w r11, r8, r3, lsl #2 -; CHECK-NEXT: add.w lr, r7, r2, lsr #2 -; CHECK-NEXT: adr r7, .LCPI1_0 -; CHECK-NEXT: vldrw.u32 q0, [r7] -; CHECK-NEXT: adr r7, .LCPI1_1 -; CHECK-NEXT: add.w 
r10, r1, r3, lsl #2 -; CHECK-NEXT: add.w r12, r0, r3, lsl #2 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: vldrw.u32 q0, [r4] +; CHECK-NEXT: add.w lr, r3, r1, lsr #2 +; CHECK-NEXT: str r7, [sp] @ 4-byte Spill +; CHECK-NEXT: adr r4, .LCPI1_1 +; CHECK-NEXT: add.w r11, r2, r7, lsl #2 +; CHECK-NEXT: add.w r1, r9, r7, lsl #2 +; CHECK-NEXT: add.w r12, r0, r7, lsl #2 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: vldrw.u32 q1, [r7] -; CHECK-NEXT: mov.w r3, #-1 -; CHECK-NEXT: mvn r9, #-2147483648 +; CHECK-NEXT: vldrw.u32 q1, [r4] +; CHECK-NEXT: mov.w r10, #-1 ; CHECK-NEXT: .LBB1_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q2, [r0], #16 -; CHECK-NEXT: vldrw.u32 q3, [r1], #16 -; CHECK-NEXT: mov r2, lr +; CHECK-NEXT: vldrw.u32 q3, [r9], #16 ; CHECK-NEXT: vmov.f32 s16, s10 ; CHECK-NEXT: vmov.f32 s20, s14 ; CHECK-NEXT: vmov.f32 s18, s11 @@ -272,12 +270,12 @@ ; CHECK-NEXT: vmullb.s32 q6, q5, q4 ; CHECK-NEXT: vmov.f32 s10, s9 ; CHECK-NEXT: vmov r7, s25 -; CHECK-NEXT: vmov r6, s24 -; CHECK-NEXT: asrl r6, r7, #31 -; CHECK-NEXT: vmov lr, s26 -; CHECK-NEXT: rsbs.w r5, r6, #-2147483648 +; CHECK-NEXT: vmov r4, s24 +; CHECK-NEXT: asrl r4, r7, #31 +; CHECK-NEXT: vmov r8, s26 +; CHECK-NEXT: rsbs.w r5, r4, #-2147483648 ; CHECK-NEXT: vmov.f32 s14, s13 -; CHECK-NEXT: sbcs.w r5, r3, r7 +; CHECK-NEXT: sbcs.w r5, r10, r7 ; CHECK-NEXT: mov.w r5, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r5, #1 @@ -286,56 +284,56 @@ ; CHECK-NEXT: vmov.32 q4[0], r5 ; CHECK-NEXT: vmov.32 q4[1], r5 ; CHECK-NEXT: vmov r5, s27 -; CHECK-NEXT: asrl lr, r5, #31 -; CHECK-NEXT: vmov.32 q6[0], r6 -; CHECK-NEXT: rsbs.w r4, lr, #-2147483648 +; CHECK-NEXT: asrl r8, r5, #31 +; CHECK-NEXT: vmov.32 q6[0], r4 +; CHECK-NEXT: rsbs.w r6, r8, #-2147483648 ; CHECK-NEXT: vmov.32 q6[1], r7 -; CHECK-NEXT: sbcs.w r4, r3, r5 -; CHECK-NEXT: vmov.32 q6[2], lr -; CHECK-NEXT: mov.w r4, #0 +; CHECK-NEXT: sbcs.w r6, r10, r5 +; CHECK-NEXT: vmov.32 q6[2], r8 +; CHECK-NEXT: mov.w r6, #0 ; CHECK-NEXT: vmov.32 
q6[3], r5 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r4, #1 -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: csetm r4, ne -; CHECK-NEXT: mov lr, r2 -; CHECK-NEXT: vmov.32 q4[2], r4 -; CHECK-NEXT: vmov.32 q4[3], r4 -; CHECK-NEXT: vmov r4, s14 +; CHECK-NEXT: movlt r6, #1 +; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: csetm r6, ne +; CHECK-NEXT: mvn r8, #-2147483648 +; CHECK-NEXT: vmov.32 q4[2], r6 +; CHECK-NEXT: vmov.32 q4[3], r6 +; CHECK-NEXT: vmov r6, s14 ; CHECK-NEXT: vbic q5, q0, q4 ; CHECK-NEXT: vand q4, q6, q4 ; CHECK-NEXT: vorr q4, q4, q5 -; CHECK-NEXT: vmov r6, s16 -; CHECK-NEXT: vmov r7, s17 -; CHECK-NEXT: subs.w r6, r6, r9 -; CHECK-NEXT: sbcs r7, r7, #0 -; CHECK-NEXT: vmov r6, s18 -; CHECK-NEXT: mov.w r7, #0 +; CHECK-NEXT: vmov r5, s16 +; CHECK-NEXT: vmov r4, s17 +; CHECK-NEXT: subs.w r5, r5, r8 +; CHECK-NEXT: sbcs r4, r4, #0 +; CHECK-NEXT: vmov r5, s18 +; CHECK-NEXT: mov.w r4, #0 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r7, #1 -; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: csetm r7, ne -; CHECK-NEXT: vmov.32 q5[0], r7 -; CHECK-NEXT: vmov.32 q5[1], r7 -; CHECK-NEXT: vmov r7, s19 -; CHECK-NEXT: subs.w r6, r6, r9 -; CHECK-NEXT: vmov r6, s12 -; CHECK-NEXT: sbcs r7, r7, #0 -; CHECK-NEXT: mov.w r7, #0 +; CHECK-NEXT: movlt r4, #1 +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: csetm r4, ne +; CHECK-NEXT: vmov.32 q5[0], r4 +; CHECK-NEXT: vmov.32 q5[1], r4 +; CHECK-NEXT: vmov r4, s19 +; CHECK-NEXT: subs.w r5, r5, r8 +; CHECK-NEXT: vmov r5, s12 +; CHECK-NEXT: sbcs r4, r4, #0 +; CHECK-NEXT: mov.w r4, #0 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r7, #1 -; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: csetm r7, ne -; CHECK-NEXT: vmov.32 q5[2], r7 -; CHECK-NEXT: vmov r7, s8 +; CHECK-NEXT: movlt r4, #1 +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: csetm r4, ne +; CHECK-NEXT: vmov.32 q5[2], r4 +; CHECK-NEXT: vmov r4, s8 ; CHECK-NEXT: vbic q6, q1, q5 ; CHECK-NEXT: vand q4, q4, q5 ; CHECK-NEXT: vorr q4, q4, q6 -; CHECK-NEXT: smull r6, r7, r6, r7 -; CHECK-NEXT: asrl r6, r7, #31 -; CHECK-NEXT: rsbs.w r5, r6, 
#-2147483648 -; CHECK-NEXT: vmov.32 q3[0], r6 -; CHECK-NEXT: sbcs.w r5, r3, r7 +; CHECK-NEXT: smull r4, r7, r5, r4 +; CHECK-NEXT: asrl r4, r7, #31 +; CHECK-NEXT: rsbs.w r5, r4, #-2147483648 +; CHECK-NEXT: vmov.32 q3[0], r4 +; CHECK-NEXT: sbcs.w r5, r10, r7 ; CHECK-NEXT: vmov.32 q3[1], r7 ; CHECK-NEXT: mov.w r5, #0 ; CHECK-NEXT: it lt @@ -345,79 +343,79 @@ ; CHECK-NEXT: vmov.32 q5[0], r5 ; CHECK-NEXT: vmov.32 q5[1], r5 ; CHECK-NEXT: vmov r5, s10 -; CHECK-NEXT: smull r4, r5, r4, r5 -; CHECK-NEXT: asrl r4, r5, #31 -; CHECK-NEXT: rsbs.w r2, r4, #-2147483648 -; CHECK-NEXT: vmov.32 q3[2], r4 -; CHECK-NEXT: sbcs.w r2, r3, r5 +; CHECK-NEXT: smull r6, r5, r6, r5 +; CHECK-NEXT: asrl r6, r5, #31 +; CHECK-NEXT: rsbs.w r3, r6, #-2147483648 +; CHECK-NEXT: vmov.32 q3[2], r6 +; CHECK-NEXT: sbcs.w r3, r10, r5 ; CHECK-NEXT: vmov.32 q3[3], r5 -; CHECK-NEXT: mov.w r2, #0 +; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r2, #1 -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, ne -; CHECK-NEXT: vmov.32 q5[2], r2 -; CHECK-NEXT: vmov.32 q5[3], r2 +; CHECK-NEXT: movlt r3, #1 +; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: vmov.32 q5[2], r3 +; CHECK-NEXT: vmov.32 q5[3], r3 ; CHECK-NEXT: vbic q2, q0, q5 ; CHECK-NEXT: vand q3, q3, q5 ; CHECK-NEXT: vorr q2, q3, q2 -; CHECK-NEXT: vmov r7, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: subs.w r7, r7, r9 -; CHECK-NEXT: sbcs r2, r2, #0 -; CHECK-NEXT: vmov r7, s10 -; CHECK-NEXT: mov.w r2, #0 +; CHECK-NEXT: vmov r4, s8 +; CHECK-NEXT: vmov r3, s9 +; CHECK-NEXT: subs.w r4, r4, r8 +; CHECK-NEXT: sbcs r3, r3, #0 +; CHECK-NEXT: vmov r4, s10 +; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r2, #1 -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, ne -; CHECK-NEXT: vmov.32 q3[0], r2 -; CHECK-NEXT: vmov.32 q3[1], r2 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: subs.w r7, r7, r9 -; CHECK-NEXT: sbcs r2, r2, #0 -; CHECK-NEXT: mov.w r2, #0 +; CHECK-NEXT: movlt r3, #1 +; CHECK-NEXT: cmp r3, #0 +; 
CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: vmov.32 q3[0], r3 +; CHECK-NEXT: vmov.32 q3[1], r3 +; CHECK-NEXT: vmov r3, s11 +; CHECK-NEXT: subs.w r4, r4, r8 +; CHECK-NEXT: sbcs r3, r3, #0 +; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r2, #1 -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, ne -; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: movlt r3, #1 +; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: vmov.32 q3[2], r3 ; CHECK-NEXT: vbic q5, q1, q3 ; CHECK-NEXT: vand q2, q2, q3 ; CHECK-NEXT: vorr q2, q2, q5 ; CHECK-NEXT: vmov.f32 s9, s10 ; CHECK-NEXT: vmov.f32 s10, s16 ; CHECK-NEXT: vmov.f32 s11, s18 -; CHECK-NEXT: vstrb.8 q2, [r8], #16 +; CHECK-NEXT: vstrb.8 q2, [r2], #16 ; CHECK-NEXT: le lr, .LBB1_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block -; CHECK-NEXT: ldrd r2, r3, [sp] @ 8-byte Folded Reload -; CHECK-NEXT: cmp r2, r3 +; CHECK-NEXT: ldrd r7, r3, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: cmp r7, r3 ; CHECK-NEXT: beq .LBB1_8 ; CHECK-NEXT: .LBB1_6: @ %for.body.preheader21 -; CHECK-NEXT: sub.w lr, r3, r2 +; CHECK-NEXT: sub.w lr, r3, r7 ; CHECK-NEXT: mov.w r0, #-1 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: mov.w r1, #-2147483648 -; CHECK-NEXT: mvn r3, #-2147483648 +; CHECK-NEXT: mov.w r3, #-2147483648 +; CHECK-NEXT: mvn r2, #-2147483648 ; CHECK-NEXT: .LBB1_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr r2, [r12], #4 -; CHECK-NEXT: ldr r4, [r10], #4 -; CHECK-NEXT: smull r2, r5, r4, r2 -; CHECK-NEXT: asrl r2, r5, #31 -; CHECK-NEXT: subs r4, r1, r2 -; CHECK-NEXT: sbcs.w r4, r0, r5 -; CHECK-NEXT: mov.w r4, #0 +; CHECK-NEXT: ldr r4, [r12], #4 +; CHECK-NEXT: ldr r5, [r1], #4 +; CHECK-NEXT: smull r4, r5, r5, r4 +; CHECK-NEXT: asrl r4, r5, #31 +; CHECK-NEXT: subs r6, r3, r4 +; CHECK-NEXT: sbcs.w r6, r0, r5 +; CHECK-NEXT: mov.w r6, #0 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r4, #1 -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: csel r2, r2, r1, ne -; CHECK-NEXT: csel r4, r5, r0, ne -; CHECK-NEXT: subs r5, r2, 
r3 -; CHECK-NEXT: sbcs r4, r4, #0 -; CHECK-NEXT: csel r2, r2, r3, lt -; CHECK-NEXT: str r2, [r11], #4 +; CHECK-NEXT: movlt r6, #1 +; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: csel r4, r4, r3, ne +; CHECK-NEXT: csel r5, r5, r0, ne +; CHECK-NEXT: subs r6, r4, r2 +; CHECK-NEXT: sbcs r5, r5, #0 +; CHECK-NEXT: csel r4, r4, r2, lt +; CHECK-NEXT: str r4, [r11], #4 ; CHECK-NEXT: le lr, .LBB1_7 ; CHECK-NEXT: .LBB1_8: @ %for.cond.cleanup ; CHECK-NEXT: add sp, #8 @@ -1366,8 +1364,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, lr} ; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: beq .LBB7_8 +; CHECK-NEXT: cbz r3, .LBB7_8 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-NEXT: cmp r3, #7 ; CHECK-NEXT: bhi .LBB7_3 @@ -2563,8 +2560,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, lr} ; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: beq .LBB16_8 +; CHECK-NEXT: cbz r3, .LBB16_8 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-NEXT: cmp r3, #15 ; CHECK-NEXT: bhi .LBB16_3 @@ -3339,8 +3335,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, lr} ; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: beq .LBB20_8 +; CHECK-NEXT: cbz r3, .LBB20_8 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-NEXT: cmp r3, #7 ; CHECK-NEXT: bhi .LBB20_3 Index: llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll +++ llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll @@ -137,8 +137,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: beq .LBB1_8 +; CHECK-NEXT: cbz r2, .LBB1_8 ; CHECK-NEXT: @ %bb.1: @ %while.body.preheader ; CHECK-NEXT: cmp r2, #4 ; CHECK-NEXT: blo .LBB1_9