diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -15502,6 +15502,45 @@ hardware-loop count with a target specific instruction, usually a move of this value to a special register or a hardware-loop instruction. + +'``llvm.start.loop.iterations.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +This is an overloaded intrinsic. + +:: + + declare i32 @llvm.start.loop.iterations.i32(i32) + declare i64 @llvm.start.loop.iterations.i64(i64) + +Overview: +""""""""" + +The '``llvm.start.loop.iterations.*``' intrinsics are similar to the +'``llvm.set.loop.iterations.*``' intrinsics, used to specify the +hardware-loop trip count but also produce a value identical to the input +that can be used as the input to the loop. They are placed in the loop +preheader basic block and the output is expected to be the input to the +phi for the induction variable of the loop, decremented by the +'``llvm.loop.decrement.reg.*``'. + +Arguments: +"""""""""" + +The integer operand is the loop trip count of the hardware-loop, and thus +not e.g. the loop back-edge taken count. + +Semantics: +"""""""""" + +The '``llvm.start.loop.iterations.*``' intrinsics do not perform any arithmetic +on their operand. It's a hint to the backend that can use this to set up the +hardware-loop count with a target specific instruction, usually a move of this +value to a special register or a hardware-loop instruction. + '``llvm.test.set.loop.iterations.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1576,6 +1576,11 @@ def int_set_loop_iterations : DefaultAttrsIntrinsic<[], [llvm_anyint_ty], [IntrNoDuplicate]>; +// Same as the above, but produces a value (the same as the input operand) to +// be fed into the loop. 
+def int_start_loop_iterations : + DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoDuplicate]>; + // Specify that the value given is the number of iterations that the next loop // will execute. Also test that the given count is not zero, allowing it to // control entry to a 'while' loop. diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -6672,6 +6672,10 @@ const SCEV *ClampedX = getUMinExpr(X, getNotSCEV(Y)); return getAddExpr(ClampedX, Y, SCEV::FlagNUW); } + case Intrinsic::start_loop_iterations: + // A start_loop_iterations is just equivalent to the first operand for + // SCEV purposes. + return getSCEV(II->getArgOperand(0)); default: break; } diff --git a/llvm/lib/CodeGen/HardwareLoops.cpp b/llvm/lib/CodeGen/HardwareLoops.cpp --- a/llvm/lib/CodeGen/HardwareLoops.cpp +++ b/llvm/lib/CodeGen/HardwareLoops.cpp @@ -165,7 +165,7 @@ Value *InitLoopCount(); // Insert the set_loop_iteration intrinsic. - void InsertIterationSetup(Value *LoopCountInit); + Value *InsertIterationSetup(Value *LoopCountInit); // Insert the loop_decrement intrinsic. void InsertLoopDec(); @@ -325,11 +325,11 @@ return; } - InsertIterationSetup(LoopCountInit); + Value *Setup = InsertIterationSetup(LoopCountInit); if (UsePHICounter || ForceHardwareLoopPHI) { Instruction *LoopDec = InsertLoopRegDec(LoopCountInit); - Value *EltsRem = InsertPHICounter(LoopCountInit, LoopDec); + Value *EltsRem = InsertPHICounter(Setup, LoopDec); LoopDec->setOperand(0, EltsRem); UpdateBranch(LoopDec); } else @@ -437,11 +437,13 @@ return Count; } -void HardwareLoop::InsertIterationSetup(Value *LoopCountInit) { +Value* HardwareLoop::InsertIterationSetup(Value *LoopCountInit) { IRBuilder<> Builder(BeginBB->getTerminator()); Type *Ty = LoopCountInit->getType(); - Intrinsic::ID ID = UseLoopGuard ? 
- Intrinsic::test_set_loop_iterations : Intrinsic::set_loop_iterations; + bool UsePhi = UsePHICounter || ForceHardwareLoopPHI; + Intrinsic::ID ID = UseLoopGuard ? Intrinsic::test_set_loop_iterations + : (UsePhi ? Intrinsic::start_loop_iterations + : Intrinsic::set_loop_iterations); Function *LoopIter = Intrinsic::getDeclaration(M, ID, Ty); Value *SetCount = Builder.CreateCall(LoopIter, LoopCountInit); @@ -457,6 +459,7 @@ } LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop counter: " << *SetCount << "\n"); + return UseLoopGuard ? LoopCountInit : SetCount; } void HardwareLoop::InsertLoopDec() { diff --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td --- a/llvm/lib/Target/ARM/ARMInstrThumb2.td +++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td @@ -5420,9 +5420,11 @@ let isTerminator = 1; } +let Predicates = [IsThumb2, HasV8_1MMainline, HasLOB] in { + def t2DoLoopStart : - t2PseudoInst<(outs), (ins rGPR:$elts), 4, IIC_Br, - [(int_set_loop_iterations rGPR:$elts)]>, Sched<[WriteBr]>; + t2PseudoInst<(outs GPRlr:$X), (ins rGPR:$elts), 4, IIC_Br, + [(set GPRlr:$X, (int_start_loop_iterations rGPR:$elts))]>; let hasSideEffects = 0 in def t2LoopDec : @@ -5444,6 +5446,8 @@ } // end isBranch, isTerminator, hasSideEffects +} + } // end isNotDuplicable class CS opcode, list pattern=[]> diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -429,7 +429,10 @@ // Return the operand for the loop start instruction. This will be the loop // iteration count, or the number of elements if we're tail predicating. MachineOperand &getLoopStartOperand() { - return IsTailPredicationLegal() ? TPNumElements : Start->getOperand(0); + if (IsTailPredicationLegal()) + return TPNumElements; + return Start->getOpcode() == ARM::t2DoLoopStart ? 
Start->getOperand(1) + : Start->getOperand(0); } unsigned getStartOpcode() const { @@ -495,6 +498,7 @@ bool RevertNonLoops(); void RevertWhile(MachineInstr *MI) const; + void RevertDo(MachineInstr *MI) const; bool RevertLoopDec(MachineInstr *MI) const; @@ -618,8 +622,12 @@ // count instead of iteration count, won't affect any other instructions // than the LoopStart and LoopDec. // TODO: We should try to insert the [W|D]LSTP after any of the other uses. - if (StartInsertPt == Start && Start->getOperand(0).getReg() == ARM::LR) { - if (auto *IterCount = RDA.getMIOperand(Start, 0)) { + Register StartReg = Start->getOpcode() == ARM::t2DoLoopStart + ? Start->getOperand(1).getReg() + : Start->getOperand(0).getReg(); + if (StartInsertPt == Start && StartReg == ARM::LR) { + if (auto *IterCount = RDA.getMIOperand( + Start, Start->getOpcode() == ARM::t2DoLoopStart ? 1 : 0)) { SmallPtrSet Uses; RDA.getGlobalUses(IterCount, MCRegister::from(ARM::LR), Uses); for (auto *Use : Uses) { @@ -1053,53 +1061,15 @@ MachineBasicBlock *&InsertBB, ReachingDefAnalysis &RDA, InstSet &ToRemove) { - // We can define LR because LR already contains the same value. - if (Start->getOperand(0).getReg() == ARM::LR) { + // For a t2DoLoopStart it is always valid to use the start insertion point. + // For WLS we can define LR if LR already contains the same value. + if (Start->getOpcode() == ARM::t2DoLoopStart || + Start->getOperand(0).getReg() == ARM::LR) { InsertPt = MachineBasicBlock::iterator(Start); InsertBB = Start->getParent(); return true; } - Register CountReg = Start->getOperand(0).getReg(); - auto IsMoveLR = [&CountReg](MachineInstr *MI) { - return MI->getOpcode() == ARM::tMOVr && - MI->getOperand(0).getReg() == ARM::LR && - MI->getOperand(1).getReg() == CountReg && - MI->getOperand(2).getImm() == ARMCC::AL; - }; - - // Find an insertion point: - // - Is there a (mov lr, Count) before Start? If so, and nothing else - // writes to Count before Start, we can insert at start. 
- if (auto *LRDef = - RDA.getUniqueReachingMIDef(Start, MCRegister::from(ARM::LR))) { - if (IsMoveLR(LRDef) && - RDA.hasSameReachingDef(Start, LRDef, CountReg.asMCReg())) { - SmallPtrSet Ignore = { Dec }; - if (!TryRemove(LRDef, RDA, ToRemove, Ignore)) - return false; - InsertPt = MachineBasicBlock::iterator(Start); - InsertBB = Start->getParent(); - return true; - } - } - - // - Is there a (mov lr, Count) after Start? If so, and nothing else writes - // to Count after Start, we can insert at that mov (which will now be - // dead). - MachineBasicBlock *MBB = Start->getParent(); - if (auto *LRDef = - RDA.getLocalLiveOutMIDef(MBB, MCRegister::from(ARM::LR))) { - if (IsMoveLR(LRDef) && RDA.hasSameReachingDef(Start, LRDef, CountReg)) { - SmallPtrSet Ignore = { Start, Dec }; - if (!TryRemove(LRDef, RDA, ToRemove, Ignore)) - return false; - InsertPt = MachineBasicBlock::iterator(LRDef); - InsertBB = LRDef->getParent(); - return true; - } - } - // We've found no suitable LR def and Start doesn't use LR directly. Can we // just define LR anyway? if (!RDA.isSafeToDefRegAt(Start, MCRegister::from(ARM::LR))) @@ -1364,6 +1334,16 @@ MI->eraseFromParent(); } +void ARMLowOverheadLoops::RevertDo(MachineInstr *MI) const { + LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to mov: " << *MI); + MachineBasicBlock *MBB = MI->getParent(); + BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::tMOVr)) + .add(MI->getOperand(0)) + .add(MI->getOperand(1)) + .add(predOps(ARMCC::AL)); + MI->eraseFromParent(); +} + bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const { LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI); MachineBasicBlock *MBB = MI->getParent(); @@ -1432,7 +1412,7 @@ // // $lr = big-itercount-expression // .. -// t2DoLoopStart renamable $lr +// $lr = t2DoLoopStart renamable $lr // vector.body: // .. 
// $vpr = MVE_VCTP32 renamable $r3 @@ -1455,7 +1435,8 @@ LLVM_DEBUG(dbgs() << "ARM Loops: Trying DCE on loop iteration count.\n"); - MachineInstr *Def = RDA->getMIOperand(LoLoop.Start, 0); + MachineInstr *Def = RDA->getMIOperand( + LoLoop.Start, LoLoop.Start->getOpcode() == ARM::t2DoLoopStart ? 1 : 0); if (!Def) { LLVM_DEBUG(dbgs() << "ARM Loops: Couldn't find iteration count.\n"); return; @@ -1634,7 +1615,7 @@ if (LoLoop.Start->getOpcode() == ARM::t2WhileLoopStart) RevertWhile(LoLoop.Start); else - LoLoop.Start->eraseFromParent(); + RevertDo(LoLoop.Start); bool FlagsAlreadySet = RevertLoopDec(LoLoop.Dec); RevertLoopEnd(LoLoop.End, FlagsAlreadySet); } else { @@ -1699,7 +1680,7 @@ if (Start->getOpcode() == ARM::t2WhileLoopStart) RevertWhile(Start); else - Start->eraseFromParent(); + RevertDo(Start); } for (auto *Dec : Decs) RevertLoopDec(Dec); diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -1679,7 +1679,7 @@ switch (Call->getIntrinsicID()) { default: break; - case Intrinsic::set_loop_iterations: + case Intrinsic::start_loop_iterations: case Intrinsic::test_set_loop_iterations: case Intrinsic::loop_decrement: case Intrinsic::loop_decrement_reg: diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp --- a/llvm/lib/Target/ARM/MVETailPredication.cpp +++ b/llvm/lib/Target/ARM/MVETailPredication.cpp @@ -188,7 +188,7 @@ continue; Intrinsic::ID ID = Call->getIntrinsicID(); - if (ID == Intrinsic::set_loop_iterations || + if (ID == Intrinsic::start_loop_iterations || ID == Intrinsic::test_set_loop_iterations) return cast(&I); } diff --git a/llvm/test/CodeGen/ARM/machine-outliner-unoutlinable.mir b/llvm/test/CodeGen/ARM/machine-outliner-unoutlinable.mir --- a/llvm/test/CodeGen/ARM/machine-outliner-unoutlinable.mir +++ 
b/llvm/test/CodeGen/ARM/machine-outliner-unoutlinable.mir @@ -152,7 +152,7 @@ $q5 = MVE_VDUP32 $r3, 0, $noreg, $q5 $q4 = MVE_VDUP32 $r4, 0, $noreg, $q4 $q0 = MVE_VADDf32 $q4, $q5, 0, $noreg, $q0 - t2DoLoopStart $r4 + $lr = t2DoLoopStart $r4 $r0 = MVE_VMOV_from_lane_32 renamable $q0, 1, 14, $noreg tBL 14, $noreg, @z bb.1: @@ -160,7 +160,7 @@ $q5 = MVE_VDUP32 $r3, 0, $noreg, $q5 $q4 = MVE_VDUP32 $r4, 0, $noreg, $q4 $q0 = MVE_VADDf32 $q4, $q5, 0, $noreg, $q0 - t2DoLoopStart $r4 + $lr = t2DoLoopStart $r4 $r0 = MVE_VMOV_from_lane_32 renamable $q0, 1, 14, $noreg tBL 14, $noreg, @z bb.2: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/branch-targets.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/branch-targets.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/branch-targets.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/branch-targets.ll @@ -21,7 +21,7 @@ ; CHECK-END: b .LBB0_2 define void @check_loop_dec_brcond_combine(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) { entry: - call void @llvm.set.loop.iterations.i32(i32 %N) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %N) br label %for.body.preheader for.body.preheader: @@ -49,7 +49,7 @@ %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ] %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ] %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ] - %count = phi i32 [ %N, %for.body.preheader ], [ %count.next, %for.body ] + %count = phi i32 [ %start, %for.body.preheader ], [ %count.next, %for.body ] br label %for.body for.cond.cleanup: @@ -64,7 +64,7 @@ ; CHECK-MID: tB %bb.2 define void @check_loop_dec_ugt_brcond_combine(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) { entry: - call void @llvm.set.loop.iterations.i32(i32 %N) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %N) br label %for.body.preheader for.body.preheader: @@ -92,7 +92,7 @@ 
%lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ] %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ] %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ] - %count = phi i32 [ %N, %for.body.preheader ], [ %count.next, %for.body ] + %count = phi i32 [ %start, %for.body.preheader ], [ %count.next, %for.body ] br label %for.body for.cond.cleanup: @@ -107,7 +107,7 @@ ; CHECK-MID: tB %bb.2 define void @check_loop_dec_ult_brcond_combine(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) { entry: - call void @llvm.set.loop.iterations.i32(i32 %N) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %N) br label %for.body.preheader for.body.preheader: @@ -135,7 +135,7 @@ %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ] %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ] %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ] - %count = phi i32 [ %N, %for.body.preheader ], [ %count.next, %for.body ] + %count = phi i32 [ %start, %for.body.preheader ], [ %count.next, %for.body ] br label %for.body for.cond.cleanup: @@ -150,7 +150,7 @@ ; CHECK-MID: tB %bb.2 define void @check_loop_dec_ult_xor_brcond_combine(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) { entry: - call void @llvm.set.loop.iterations.i32(i32 %N) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %N) br label %for.body.preheader for.body.preheader: @@ -179,7 +179,7 @@ %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ] %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ] %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ] - %count = phi i32 [ %N, %for.body.preheader ], [ %count.next, %for.body ] + %count = phi i32 [ %start, %for.body.preheader ], [ %count.next, %for.body ] br label 
%for.body for.cond.cleanup: @@ -194,7 +194,7 @@ ; CHECK-MID: tB %bb.2 define void @check_loop_dec_sgt_brcond_combine(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) { entry: - call void @llvm.set.loop.iterations.i32(i32 %N) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %N) br label %for.body.preheader for.body.preheader: @@ -222,7 +222,7 @@ %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ] %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ] %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ] - %count = phi i32 [ %N, %for.body.preheader ], [ %count.next, %for.body ] + %count = phi i32 [ %start, %for.body.preheader ], [ %count.next, %for.body ] br label %for.body for.cond.cleanup: @@ -237,7 +237,7 @@ ; CHECK-MID: tB %bb.2 define void @check_loop_dec_sge_brcond_combine(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) { entry: - call void @llvm.set.loop.iterations.i32(i32 %N) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %N) br label %for.body.preheader for.body.preheader: @@ -265,7 +265,7 @@ %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ] %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ] %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ] - %count = phi i32 [ %N, %for.body.preheader ], [ %count.next, %for.body ] + %count = phi i32 [ %start, %for.body.preheader ], [ %count.next, %for.body ] br label %for.body for.cond.cleanup: @@ -280,7 +280,7 @@ ; CHECK-MID: tB %bb.2 define void @check_loop_dec_sge_xor_brcond_combine(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) { entry: - call void @llvm.set.loop.iterations.i32(i32 %N) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %N) br label %for.body.preheader for.body.preheader: @@ -309,7 +309,7 @@ %lsr.iv9 = phi i32* 
[ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ] %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ] %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ] - %count = phi i32 [ %N, %for.body.preheader ], [ %count.next, %for.body ] + %count = phi i32 [ %start, %for.body.preheader ], [ %count.next, %for.body ] br label %for.body for.cond.cleanup: @@ -324,7 +324,7 @@ ; CHECK-MID: tB %bb.2 define void @check_loop_dec_uge_brcond_combine(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) { entry: - call void @llvm.set.loop.iterations.i32(i32 %N) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %N) br label %for.body.preheader for.body.preheader: @@ -352,7 +352,7 @@ %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ] %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ] %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ] - %count = phi i32 [ %N, %for.body.preheader ], [ %count.next, %for.body ] + %count = phi i32 [ %start, %for.body.preheader ], [ %count.next, %for.body ] br label %for.body for.cond.cleanup: @@ -367,7 +367,7 @@ ; CHECK-MID: tB %bb.2 define void @check_loop_dec_uge_xor_brcond_combine(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) { entry: - call void @llvm.set.loop.iterations.i32(i32 %N) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %N) br label %for.body.preheader for.body.preheader: @@ -396,7 +396,7 @@ %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ] %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ] %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ] - %count = phi i32 [ %N, %for.body.preheader ], [ %count.next, %for.body ] + %count = phi i32 [ %start, %for.body.preheader ], [ %count.next, %for.body ] br label %for.body 
for.cond.cleanup: @@ -507,6 +507,6 @@ ret void } -declare void @llvm.set.loop.iterations.i32(i32) +declare i32 @llvm.start.loop.iterations.i32(i32) declare i1 @llvm.test.set.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/clear-maskedinsts.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/clear-maskedinsts.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/clear-maskedinsts.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/clear-maskedinsts.ll @@ -17,17 +17,17 @@ ; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i32 [[TMP1]], 1 ; CHECK-NEXT: br i1 [[TOBOOL]], label [[VECTOR_BODY75_PREHEADER:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.body75.preheader: -; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP2]]) +; CHECK-NEXT: [[START1:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP2]]) ; CHECK-NEXT: br label [[VECTOR_BODY75:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[BROADCAST_SPLATINSERT71:%.*]] = insertelement <4 x i32> undef, i32 [[X]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT72:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT71]], <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP3]]) +; CHECK-NEXT: [[START2:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP3]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[LSR_IV9:%.*]] = phi i32* [ [[SCEVGEP10:%.*]], [[VECTOR_BODY]] ], [ [[D:%.*]], [[VECTOR_PH]] ] ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[TMP3]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[START2]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ [[N]], [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[LSR_IV911:%.*]] = bitcast i32* [[LSR_IV9]] to <4 x 
i32>* ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 @@ -48,7 +48,7 @@ ; CHECK-NEXT: [[LSR_IV3:%.*]] = phi i32* [ [[S2:%.*]], [[VECTOR_BODY75_PREHEADER]] ], [ [[SCEVGEP4:%.*]], [[VECTOR_BODY75]] ] ; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32* [ [[D]], [[VECTOR_BODY75_PREHEADER]] ], [ [[SCEVGEP:%.*]], [[VECTOR_BODY75]] ] ; CHECK-NEXT: [[INDEX80:%.*]] = phi i32 [ [[INDEX_NEXT81:%.*]], [[VECTOR_BODY75]] ], [ 0, [[VECTOR_BODY75_PREHEADER]] ] -; CHECK-NEXT: [[TMP12:%.*]] = phi i32 [ [[TMP2]], [[VECTOR_BODY75_PREHEADER]] ], [ [[TMP17:%.*]], [[VECTOR_BODY75]] ] +; CHECK-NEXT: [[TMP12:%.*]] = phi i32 [ [[START1]], [[VECTOR_BODY75_PREHEADER]] ], [ [[TMP17:%.*]], [[VECTOR_BODY75]] ] ; CHECK-NEXT: [[LSR_IV68:%.*]] = bitcast i32* [[LSR_IV6]] to <4 x i32>* ; CHECK-NEXT: [[LSR_IV35:%.*]] = bitcast i32* [[LSR_IV3]] to <4 x i32>* ; CHECK-NEXT: [[LSR_IV2:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>* @@ -88,19 +88,19 @@ br i1 %tobool, label %vector.body75.preheader, label %vector.ph vector.body75.preheader: ; preds = %for.body.lr.ph - call void @llvm.set.loop.iterations.i32(i32 %2) + %start1 = call i32 @llvm.start.loop.iterations.i32(i32 %2) br label %vector.body75 vector.ph: ; preds = %for.body.lr.ph %broadcast.splatinsert71 = insertelement <4 x i32> undef, i32 %x, i32 0 %broadcast.splat72 = shufflevector <4 x i32> %broadcast.splatinsert71, <4 x i32> undef, <4 x i32> zeroinitializer - call void @llvm.set.loop.iterations.i32(i32 %3) + %start2 = call i32 @llvm.start.loop.iterations.i32(i32 %3) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %lsr.iv9 = phi i32* [ %scevgep10, %vector.body ], [ %d, %vector.ph ] %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %4 = phi i32 [ %3, %vector.ph ], [ %8, %vector.body ] + %4 = phi i32 [ %start2, %vector.ph ], [ %8, %vector.body ] %lsr.iv911 = bitcast i32* %lsr.iv9 to <4 x i32>* %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer @@ -120,7 +120,7 @@ %lsr.iv3 = phi i32* [ %s2, %vector.body75.preheader ], [ %scevgep4, %vector.body75 ] %lsr.iv = phi i32* [ %d, %vector.body75.preheader ], [ %scevgep, %vector.body75 ] %index80 = phi i32 [ %index.next81, %vector.body75 ], [ 0, %vector.body75.preheader ] - %10 = phi i32 [ %2, %vector.body75.preheader ], [ %15, %vector.body75 ] + %10 = phi i32 [ %start1, %vector.body75.preheader ], [ %15, %vector.body75 ] %lsr.iv68 = bitcast i32* %lsr.iv6 to <4 x i32>* %lsr.iv35 = bitcast i32* %lsr.iv3 to <4 x i32>* %lsr.iv2 = bitcast i32* %lsr.iv to <4 x i32>* @@ -148,7 +148,7 @@ declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) declare <4 x i32> @llvm.usub.sat.v4i32(<4 x i32>, <4 x i32>) -declare void @llvm.set.loop.iterations.i32(i32) +declare i32 @llvm.start.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32(i32, i32) declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cmplx_cong.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cmplx_cong.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cmplx_cong.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cmplx_cong.mir @@ -79,7 +79,7 @@ $r4 = t2MOVTi16 killed $r4, target-flags(arm-hi16) @arm_cmplx_conj_f32_mve.cmplx_conj_sign, 14 /* CC::al */, $noreg renamable $q0 = nnan ninf nsz MVE_VLDRWU32 killed renamable $r4, 0, 0, $noreg renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r2, 19, 14 /* CC::al */, $noreg, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.1 (align 4): successors: %bb.1(0x7c000000), %bb.2(0x04000000) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-mov.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-mov.mir --- 
a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-mov.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-mov.mir @@ -9,13 +9,13 @@ entry: %scevgep = getelementptr i32, i32* %q, i32 -1 %scevgep3 = getelementptr i32, i32* %p, i32 -1 - call void @llvm.set.loop.iterations.i32(i32 %n) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %n) br label %while.body while.body: ; preds = %while.body, %entry %lsr.iv4 = phi i32* [ %scevgep5, %while.body ], [ %scevgep3, %entry ] %lsr.iv = phi i32* [ %scevgep1, %while.body ], [ %scevgep, %entry ] - %0 = phi i32 [ %n, %entry ], [ %2, %while.body ] + %0 = phi i32 [ %start, %entry ], [ %2, %while.body ] %scevgep6 = getelementptr i32, i32* %lsr.iv, i32 1 %scevgep2 = getelementptr i32, i32* %lsr.iv4, i32 1 %1 = load i32, i32* %scevgep6, align 4 @@ -30,7 +30,7 @@ ret i32 0 } - declare void @llvm.set.loop.iterations.i32(i32) #0 + declare i32 @llvm.start.loop.iterations.i32(i32) #0 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #0 attributes #0 = { noduplicate nounwind } @@ -112,7 +112,7 @@ frame-setup CFI_INSTRUCTION offset $lr, -4 frame-setup CFI_INSTRUCTION offset $r7, -8 $lr = tMOVr $r0, 14, $noreg - t2DoLoopStart killed $r0 + $lr = t2DoLoopStart killed $r0 renamable $r0, dead $cpsr = tSUBi3 killed renamable $r1, 4, 14, $noreg renamable $r1, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14, $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll @@ -15,9 +15,9 @@ ; CHECK-NEXT: bic r12, r12, #3 ; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: sub.w r12, r12, #4 -; CHECK-NEXT: add.w lr, lr, r12, lsr #2 +; CHECK-NEXT: add.w r12, lr, r12, lsr #2 +; CHECK-NEXT: dls lr, r12 ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: 
.LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: and r4, r12, #15 @@ -107,9 +107,9 @@ ; CHECK-NEXT: bic r4, r4, #3 ; CHECK-NEXT: sub.w lr, r4, #4 ; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: add.w lr, r4, lr, lsr #2 +; CHECK-NEXT: add.w r4, r4, lr, lsr #2 +; CHECK-NEXT: dls lr, r4 ; CHECK-NEXT: movs r4, #0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: and r5, r4, #15 @@ -210,9 +210,9 @@ ; CHECK-NEXT: bic r4, r4, #3 ; CHECK-NEXT: sub.w lr, r4, #4 ; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: add.w lr, r4, lr, lsr #2 +; CHECK-NEXT: add.w r4, r4, lr, lsr #2 +; CHECK-NEXT: dls lr, r4 ; CHECK-NEXT: movs r4, #0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB2_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r12 @@ -309,9 +309,9 @@ ; CHECK-NEXT: bic r4, r4, #3 ; CHECK-NEXT: sub.w lr, r4, #4 ; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: add.w lr, r4, lr, lsr #2 +; CHECK-NEXT: add.w r4, r4, lr, lsr #2 +; CHECK-NEXT: dls lr, r4 ; CHECK-NEXT: movs r4, #0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r12 @@ -402,8 +402,8 @@ ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} ; CHECK-NEXT: .LBB4_1: @ %bb3 -; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dlstp.32 lr, r2 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: .LBB4_2: @ %bb9 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r3, #4 @@ -464,8 +464,8 @@ ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} ; CHECK-NEXT: .LBB5_1: @ %bb4 -; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: .LBB5_2: @ %bb12 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0] diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/ctlz-non-zeros.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/ctlz-non-zeros.mir --- 
a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/ctlz-non-zeros.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/ctlz-non-zeros.mir @@ -10,11 +10,11 @@ br i1 %cmp, label %exit, label %loop.ph loop.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %iters) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %iters) br label %loop.body loop.body: ; preds = %loop.body, %loop.ph - %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %iters, %loop.ph ] + %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %start, %loop.ph ] %count = phi i32 [ %elts, %loop.ph ], [ %elts.rem, %loop.body ] %addr.a = phi <8 x i16>* [ %a, %loop.ph ], [ %addr.a.next, %loop.body ] %addr.b = phi <8 x i16>* [ %b, %loop.ph ], [ %addr.b.next, %loop.body ] @@ -46,11 +46,11 @@ br i1 %cmp, label %exit, label %loop.ph loop.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %iters) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %iters) br label %loop.body loop.body: ; preds = %loop.body, %loop.ph - %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %iters, %loop.ph ] + %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %start, %loop.ph ] %count = phi i32 [ %elts, %loop.ph ], [ %elts.rem, %loop.body ] %addr.a = phi <4 x i32>* [ %a, %loop.ph ], [ %addr.a.next, %loop.body ] %addr.b = phi <4 x i32>* [ %b, %loop.ph ], [ %addr.b.next, %loop.body ] @@ -82,11 +82,11 @@ br i1 %cmp, label %exit, label %loop.ph loop.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %iters) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %iters) br label %loop.body loop.body: ; preds = %loop.body, %loop.ph - %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %iters, %loop.ph ] + %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %start, %loop.ph ] %count = phi i32 [ %elts, %loop.ph ], [ %elts.rem, %loop.body ] %addr.a = phi <4 x i32>* [ %a, %loop.ph ], [ %addr.a.next, %loop.body ] %addr.b = phi <4 x i32>* [ %b, %loop.ph ], [ %addr.b.next, %loop.body ] @@ -115,7 +115,7 @@ 
declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1 immarg) declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1 immarg) declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>, i1 immarg) - declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.start.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <4 x i1> @llvm.arm.mve.vctp32(i32) declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) @@ -166,23 +166,23 @@ ; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -8 ; CHECK: tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK: t2IT 11, 8, implicit-def $itstate - ; CHECK: tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate + ; CHECK: frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate ; CHECK: bb.1.loop.ph: ; CHECK: successors: %bb.2(0x80000000) ; CHECK: liveins: $r0, $r1, $r2, $r3 - ; CHECK: renamable $r12 = t2LDRi12 $sp, 8, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) - ; CHECK: dead $lr = t2DLS renamable $r12 - ; CHECK: $r4 = tMOVr killed $r12, 14 /* CC::al */, $noreg + ; CHECK: renamable $lr = t2LDRi12 $sp, 8, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) + ; CHECK: $lr = t2DLS killed renamable $lr + ; CHECK: $r4 = tMOVr killed $lr, 14 /* CC::al */, $noreg ; CHECK: bb.2.loop.body: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $r0, $r1, $r2, $r3, $r4 + ; CHECK: $lr = tMOVr $r4, 14 /* CC::al */, $noreg ; CHECK: renamable $vpr = MVE_VCTP16 renamable $r3, 0, $noreg ; CHECK: MVE_VPST 4, implicit $vpr ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRHU16_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.addr.b, align 2) ; CHECK: renamable $q1 = MVE_VLDRHU16 killed renamable $r0, 0, 1, renamable $vpr :: (load 16 from %ir.addr.a, align 2) - ; CHECK: $lr = tMOVr $r4, 14 /* CC::al */, $noreg - ; CHECK: renamable 
$r4, dead $cpsr = tSUBi8 killed $r4, 1, 14 /* CC::al */, $noreg ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 8, 14 /* CC::al */, $noreg + ; CHECK: renamable $r4, dead $cpsr = tSUBi8 killed $r4, 1, 14 /* CC::al */, $noreg ; CHECK: renamable $q1 = MVE_VCLZs8 killed renamable $q1, 0, $noreg, undef renamable $q1 ; CHECK: $r0 = tMOVr $r1, 14 /* CC::al */, $noreg ; CHECK: renamable $q1 = MVE_VQSHRUNs16th killed renamable $q1, killed renamable $q0, 1, 0, $noreg @@ -190,7 +190,7 @@ ; CHECK: renamable $r2 = MVE_VSTRHU16_post killed renamable $q1, killed renamable $r2, 16, 1, killed renamable $vpr :: (store 16 into %ir.addr.c, align 2) ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.2 ; CHECK: bb.3.exit: - ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc + ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc bb.0.entry: successors: %bb.1(0x80000000) liveins: $r0, $r1, $r2, $r3, $r4, $lr @@ -201,27 +201,27 @@ frame-setup CFI_INSTRUCTION offset $r4, -8 tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr t2IT 11, 8, implicit-def $itstate - tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate + frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate bb.1.loop.ph: successors: %bb.2(0x80000000) - liveins: $r0, $r1, $r2, $r3, $r4, $lr + liveins: $r0, $r1, $r2, $r3 - renamable $r12 = t2LDRi12 $sp, 8, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) - t2DoLoopStart renamable $r12 - $r4 = tMOVr killed $r12, 14 /* CC::al */, $noreg + renamable $lr = t2LDRi12 $sp, 8, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) + renamable $lr = t2DoLoopStart killed renamable $lr + $r4 = tMOVr killed $lr, 14 /* CC::al */, $noreg bb.2.loop.body: successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $r0, $r1, $r2, $r3, $r4 + $lr = tMOVr $r4, 14 /* CC::al */, $noreg renamable $vpr = MVE_VCTP16 renamable $r3, 0, $noreg 
MVE_VPST 4, implicit $vpr renamable $r1, renamable $q0 = MVE_VLDRHU16_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.addr.b, align 2) renamable $q1 = MVE_VLDRHU16 killed renamable $r0, 0, 1, renamable $vpr :: (load 16 from %ir.addr.a, align 2) - $lr = tMOVr $r4, 14 /* CC::al */, $noreg - renamable $r4, dead $cpsr = tSUBi8 killed $r4, 1, 14 /* CC::al */, $noreg renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 8, 14 /* CC::al */, $noreg + renamable $r4, dead $cpsr = tSUBi8 killed $r4, 1, 14 /* CC::al */, $noreg renamable $q1 = MVE_VCLZs8 killed renamable $q1, 0, $noreg, undef renamable $q1 renamable $lr = t2LoopDec killed renamable $lr, 1 $r0 = tMOVr $r1, 14 /* CC::al */, $noreg @@ -232,7 +232,7 @@ tB %bb.3, 14 /* CC::al */, $noreg bb.3.exit: - tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc + frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc ... --- @@ -267,68 +267,69 @@ ; CHECK-LABEL: name: test_ctlz_i16 ; CHECK: bb.0.entry: ; CHECK: successors: %bb.1(0x80000000) - ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r4 - ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r4, $r7 + ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 - ; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 ; CHECK: tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK: t2IT 11, 8, implicit-def $itstate - ; CHECK: tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate + ; CHECK: frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def dead $r7, def $pc, implicit killed $itstate ; CHECK: bb.1.loop.ph: ; CHECK: successors: %bb.2(0x80000000) - ; CHECK: liveins: $r0, $r1, $r2, $r3 - ; 
CHECK: renamable $r4 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) - ; CHECK: dead $lr = t2DLS renamable $r4 - ; CHECK: $r12 = tMOVr killed $r4, 14 /* CC::al */, $noreg + ; CHECK: liveins: $r0, $r1, $r2, $r3, $r4 + ; CHECK: renamable $lr = t2LDRi12 $sp, 8, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) + ; CHECK: $lr = t2DLS killed renamable $lr + ; CHECK: $r12 = tMOVr killed $lr, 14 /* CC::al */, $noreg ; CHECK: bb.2.loop.body: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $r0, $r1, $r2, $r3, $r12 - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg + ; CHECK: liveins: $r0, $r1, $r2, $r3, $r4, $r12 ; CHECK: $lr = tMOVr $r12, 14 /* CC::al */, $noreg + ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg ; CHECK: MVE_VPST 4, implicit $vpr ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.addr.b, align 4) ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.addr.a, align 4) - ; CHECK: renamable $r12 = t2SUBri killed $r12, 1, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg + ; CHECK: renamable $r12 = t2SUBri killed $r12, 1, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $q1 = MVE_VCLZs16 killed renamable $q1, 0, $noreg, undef renamable $q1 ; CHECK: renamable $q1 = MVE_VQSHRUNs32th killed renamable $q1, killed renamable $q0, 3, 0, $noreg ; CHECK: MVE_VPST 8, implicit $vpr ; CHECK: renamable $r2 = MVE_VSTRWU32_post killed renamable $q1, killed renamable $r2, 16, 1, killed renamable $vpr :: (store 16 into %ir.addr.c, align 4) ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.2 ; CHECK: bb.3.exit: - ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc + ; CHECK: liveins: $r4 + ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, 
def dead $r7, def $pc bb.0.entry: successors: %bb.1(0x80000000) - liveins: $r0, $r1, $r2, $r3, $r4, $lr + liveins: $r0, $r1, $r2, $r3, $r7, $lr - frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp + frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp frame-setup CFI_INSTRUCTION def_cfa_offset 8 frame-setup CFI_INSTRUCTION offset $lr, -4 - frame-setup CFI_INSTRUCTION offset $r4, -8 + frame-setup CFI_INSTRUCTION offset $r7, -8 tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr t2IT 11, 8, implicit-def $itstate - tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate + frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate bb.1.loop.ph: successors: %bb.2(0x80000000) - liveins: $r0, $r1, $r2, $r3, $r4, $lr + liveins: $r0, $r1, $r2, $r3 - renamable $r4 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) - t2DoLoopStart renamable $r4 - $r12 = tMOVr killed $r4, 14 /* CC::al */, $noreg + renamable $lr = t2LDRi12 $sp, 8, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) + renamable $lr = t2DoLoopStart killed renamable $lr + $r12 = tMOVr killed $lr, 14 /* CC::al */, $noreg bb.2.loop.body: successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $r0, $r1, $r2, $r3, $r12 - renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg $lr = tMOVr $r12, 14 /* CC::al */, $noreg + renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg MVE_VPST 4, implicit $vpr renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.addr.b, align 4) renamable $r0, renamable $q1 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.addr.a, align 4) - renamable $r12 = t2SUBri killed $r12, 1, 14 /* CC::al */, $noreg, $noreg renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, 
$noreg + renamable $r12 = t2SUBri killed $r12, 1, 14 /* CC::al */, $noreg, $noreg renamable $q1 = MVE_VCLZs16 killed renamable $q1, 0, $noreg, undef renamable $q1 renamable $lr = t2LoopDec killed renamable $lr, 1 renamable $q1 = MVE_VQSHRUNs32th killed renamable $q1, killed renamable $q0, 3, 0, $noreg @@ -338,7 +339,7 @@ tB %bb.3, 14 /* CC::al */, $noreg bb.3.exit: - tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc + frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc ... --- @@ -373,68 +374,69 @@ ; CHECK-LABEL: name: test_ctlz_i32 ; CHECK: bb.0.entry: ; CHECK: successors: %bb.1(0x80000000) - ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r4 - ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r4, $r7 + ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 - ; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 ; CHECK: tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr ; CHECK: t2IT 11, 8, implicit-def $itstate - ; CHECK: tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate + ; CHECK: frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def dead $r7, def $pc, implicit killed $itstate ; CHECK: bb.1.loop.ph: ; CHECK: successors: %bb.2(0x80000000) - ; CHECK: liveins: $r0, $r1, $r2, $r3 - ; CHECK: renamable $r4 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) - ; CHECK: dead $lr = t2DLS renamable $r4 - ; CHECK: $r12 = tMOVr killed $r4, 14 /* CC::al */, $noreg + ; CHECK: liveins: $r0, $r1, $r2, $r3, $r4 + ; CHECK: renamable $lr = t2LDRi12 $sp, 8, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) + ; CHECK: $lr = t2DLS killed renamable $lr + ; CHECK: $r12 
= tMOVr killed $lr, 14 /* CC::al */, $noreg ; CHECK: bb.2.loop.body: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $r0, $r1, $r2, $r3, $r12 - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg + ; CHECK: liveins: $r0, $r1, $r2, $r3, $r4, $r12 ; CHECK: $lr = tMOVr $r12, 14 /* CC::al */, $noreg + ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg ; CHECK: MVE_VPST 4, implicit $vpr ; CHECK: renamable $r0, renamable $q0 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.addr.a, align 4) ; CHECK: renamable $r1, renamable $q1 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.addr.b, align 4) - ; CHECK: renamable $r12 = t2SUBri killed $r12, 1, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg + ; CHECK: renamable $r12 = t2SUBri killed $r12, 1, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $q1 = MVE_VCLZs32 killed renamable $q1, 0, $noreg, undef renamable $q1 ; CHECK: renamable $q0 = MVE_VQSHRUNs32th killed renamable $q0, killed renamable $q1, 3, 0, $noreg ; CHECK: MVE_VPST 8, implicit $vpr ; CHECK: renamable $r2 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r2, 16, 1, killed renamable $vpr :: (store 16 into %ir.addr.c, align 4) ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.2 ; CHECK: bb.3.exit: - ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc + ; CHECK: liveins: $r4 + ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def dead $r7, def $pc bb.0.entry: successors: %bb.1(0x80000000) - liveins: $r0, $r1, $r2, $r3, $r4, $lr + liveins: $r0, $r1, $r2, $r3, $r7, $lr - frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp + frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp frame-setup CFI_INSTRUCTION def_cfa_offset 8 frame-setup CFI_INSTRUCTION 
offset $lr, -4 - frame-setup CFI_INSTRUCTION offset $r4, -8 + frame-setup CFI_INSTRUCTION offset $r7, -8 tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr t2IT 11, 8, implicit-def $itstate - tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate + frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate bb.1.loop.ph: successors: %bb.2(0x80000000) - liveins: $r0, $r1, $r2, $r3, $r4, $lr + liveins: $r0, $r1, $r2, $r3 - renamable $r4 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) - t2DoLoopStart renamable $r4 - $r12 = tMOVr killed $r4, 14 /* CC::al */, $noreg + renamable $lr = t2LDRi12 $sp, 8, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) + renamable $lr = t2DoLoopStart killed renamable $lr + $r12 = tMOVr killed $lr, 14 /* CC::al */, $noreg bb.2.loop.body: successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $r0, $r1, $r2, $r3, $r12 - renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg $lr = tMOVr $r12, 14 /* CC::al */, $noreg + renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg MVE_VPST 4, implicit $vpr renamable $r0, renamable $q0 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.addr.a, align 4) renamable $r1, renamable $q1 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.addr.b, align 4) - renamable $r12 = t2SUBri killed $r12, 1, 14 /* CC::al */, $noreg, $noreg renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg + renamable $r12 = t2SUBri killed $r12, 1, 14 /* CC::al */, $noreg, $noreg renamable $q1 = MVE_VCLZs32 killed renamable $q1, 0, $noreg, undef renamable $q1 renamable $lr = t2LoopDec killed renamable $lr, 1 renamable $q0 = MVE_VQSHRUNs32th killed renamable $q0, killed renamable $q1, 3, 0, $noreg @@ -444,6 +446,6 @@ tB %bb.3, 14 /* CC::al */, $noreg bb.3.exit: - tPOP_RET 14 /* CC::al */, $noreg, def $r4, def 
$pc + frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc ... diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/disjoint-vcmp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/disjoint-vcmp.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/disjoint-vcmp.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/disjoint-vcmp.mir @@ -19,7 +19,7 @@ br i1 %tmp, label %bb27, label %bb3 bb3: ; preds = %bb - call void @llvm.set.loop.iterations.i32(i32 %tmp6) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp6) %scevgep1 = getelementptr i32, i32* %arg3, i32 -4 br label %bb9 @@ -27,7 +27,7 @@ %lsr.iv4 = phi i32* [ %scevgep6, %bb9 ], [ %scevgep1, %bb3 ] %lsr.iv2 = phi i32* [ %scevgep3, %bb9 ], [ %arg1, %bb3 ] %lsr.iv = phi i32* [ %scevgep, %bb9 ], [ %arg, %bb3 ] - %tmp7 = phi i32 [ %tmp6, %bb3 ], [ %tmp12, %bb9 ] + %tmp7 = phi i32 [ %start, %bb3 ], [ %tmp12, %bb9 ] %tmp8 = phi i32 [ %arg2, %bb3 ], [ %tmp11, %bb9 ] %lsr.iv1 = bitcast i32* %lsr.iv to <4 x i32>* %lsr.iv24 = bitcast i32* %lsr.iv2 to <4 x i32>* @@ -56,7 +56,7 @@ } declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) - declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.start.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <4 x i1> @llvm.arm.mve.vctp32(i32) declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) @@ -197,7 +197,7 @@ VSTR_P0_off killed renamable $vpr, $sp, 0, 14, $noreg :: (store 4 into %stack.0) renamable $q0 = MVE_VDUP32 killed renamable $r5, 0, $noreg, undef renamable $q0 $r3 = tMOVr $r0, 14, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.2.bb9: successors: %bb.2(0x7c000000), %bb.3(0x04000000) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-ignore-vctp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-ignore-vctp.mir --- 
a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-ignore-vctp.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-ignore-vctp.mir @@ -11,14 +11,14 @@ %2 = sub i32 %0, %smin %3 = lshr i32 %2, 2 %4 = add nuw nsw i32 %3, 1 - call void @llvm.set.loop.iterations.i32(i32 %4) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %4) br label %do.body do.body: ; preds = %do.body, %entry %blkCnt.0 = phi i32 [ %mul, %entry ], [ %sub, %do.body ] %pDst.addr.0 = phi float* [ %pDst, %entry ], [ %add.ptr4, %do.body ] %pSrc.addr.0 = phi float* [ %pSrc, %entry ], [ %add.ptr, %do.body ] - %5 = phi i32 [ %4, %entry ], [ %9, %do.body ] + %5 = phi i32 [ %start, %entry ], [ %9, %do.body ] %6 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blkCnt.0) %input_cast = bitcast float* %pSrc.addr.0 to <4 x float>* %7 = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %input_cast, i32 4, <4 x i1> %6, <4 x float> undef) @@ -38,7 +38,7 @@ declare <4 x i1> @llvm.arm.mve.vctp32(i32) #1 declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>) declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>) - declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.start.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) ... 
@@ -136,7 +136,7 @@ renamable $lr = nuw nsw t2ADDrs killed renamable $r2, killed renamable $r12, 19, 14, $noreg, $noreg renamable $r2 = tLEApcrel %const.0, 14, $noreg renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 0, $noreg :: (load 16 from constant-pool) - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.1.do.body (align 4): successors: %bb.1(0x7c000000), %bb.2(0x04000000) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir @@ -19,14 +19,14 @@ br i1 %cmp8, label %vector.ph, label %for.cond.cleanup vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ] %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ] %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ] - %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ] + %6 = phi i32 [ %start, %vector.ph ], [ %11, %vector.body ] %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ] %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>* %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* @@ -47,7 +47,7 @@ for.cond.cleanup: ; preds = %vector.body, %entry ret void } - declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.start.loop.iterations.i32(i32) declare <4 x i1> @llvm.arm.vctp32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) @@ -162,7 +162,7 @@ renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg renamable $lr = nuw nsw 
t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.2.vector.body: successors: %bb.2(0x7c000000), %bb.3(0x04000000) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/emptyblock.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/emptyblock.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/emptyblock.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/emptyblock.mir @@ -24,14 +24,14 @@ %5 = sub i32 %3, %smin36 %6 = lshr i32 %5, 2 %7 = add nuw nsw i32 %6, 1 - call void @llvm.set.loop.iterations.i32(i32 %7) + %start1 = call i32 @llvm.start.loop.iterations.i32(i32 %7) br label %do.body do.body: ; preds = %do.body, %entry %count.0 = phi i32 [ %0, %entry ], [ %12, %do.body ] %pInT.0 = phi float* [ %pIn, %entry ], [ %add.ptr, %do.body ] %sumVec.0 = phi <4 x float> [ zeroinitializer, %entry ], [ %11, %do.body ] - %8 = phi i32 [ %7, %entry ], [ %13, %do.body ] + %8 = phi i32 [ %start1, %entry ], [ %13, %do.body ] %pInT.033 = bitcast float* %pInT.0 to <4 x float>* %9 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %count.0) %10 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %pInT.033, i32 4, <4 x i1> %9, <4 x float> zeroinitializer) @@ -125,7 +125,7 @@ %50 = bitcast float* %arrayidx17 to <4 x float>* %51 = load <4 x float>, <4 x float>* %50, align 4 %52 = fmul fast <4 x float> %51, %40 - call void @llvm.set.loop.iterations.i32(i32 %33) + %start2 = call i32 @llvm.start.loop.iterations.i32(i32 %33) br label %do.body24 do.body24: ; preds = %do.body24, %for.body @@ -138,7 +138,7 @@ %sumVec1.0 = phi <4 x float> [ %46, %for.body ], [ %58, %do.body24 ] %sumVec2.0 = phi <4 x float> [ %49, %for.body ], [ %60, %do.body24 ] %sumVec3.0 = phi <4 x float> [ %52, %for.body ], [ %62, %do.body24 ] - %53 = phi i32 [ %33, %for.body ], [ %63, %do.body24 ] + %53 = phi i32 [ %start2, %for.body ], [ %63, %do.body24 ] %lsr.iv4 = bitcast float* %lsr.iv to <4 x float>* %lsr.iv911 = 
bitcast float* %lsr.iv9 to <4 x float>* %lsr.iv1618 = bitcast float* %lsr.iv16 to <4 x float>* @@ -219,7 +219,7 @@ %k.1200 = phi i32 [ %inc, %do.end66 ], [ %k.0.lcssa, %for.body56.preheader ] %mul57 = mul i32 %k.1200, %0 %arrayidx58 = getelementptr inbounds float, float* %2, i32 %mul57 - call void @llvm.set.loop.iterations.i32(i32 %38) + %start3 = call i32 @llvm.start.loop.iterations.i32(i32 %38) br label %do.body59 do.body59: ; preds = %do.body59, %for.body56 @@ -227,7 +227,7 @@ %pInT.2 = phi float* [ %pIn, %for.body56 ], [ %add.ptr61, %do.body59 ] %pCos0.1 = phi float* [ %arrayidx58, %for.body56 ], [ %add.ptr62, %do.body59 ] %sumVec.1 = phi <4 x float> [ zeroinitializer, %for.body56 ], [ %93, %do.body59 ] - %89 = phi i32 [ %38, %for.body56 ], [ %95, %do.body59 ] + %89 = phi i32 [ %start3, %for.body56 ], [ %95, %do.body59 ] %pInT.21 = bitcast float* %pInT.2 to <4 x float>* %pCos0.12 = bitcast float* %pCos0.1 to <4 x float>* %90 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %count.2) @@ -264,7 +264,7 @@ declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #1 declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #3 declare <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x float>, <4 x i1>) #1 - declare void @llvm.set.loop.iterations.i32(i32) #4 + declare i32 @llvm.start.loop.iterations.i32(i32) #4 declare i32 @llvm.loop.decrement.reg.i32(i32, i32) #4 ... 
@@ -414,7 +414,7 @@ $r0 = tMOVr $r4, 14 /* CC::al */, $noreg renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 $r1 = tMOVr $r5, 14 /* CC::al */, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.1.do.body (align 4): successors: %bb.1(0x7c000000), %bb.2(0x04000000) @@ -503,7 +503,7 @@ $r3 = tMOVr $r10, 14 /* CC::al */, $noreg $r5 = tMOVr $r1, 14 /* CC::al */, $noreg $r4 = tMOVr $r12, 14 /* CC::al */, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr $r7 = tMOVr $r6, 14 /* CC::al */, $noreg renamable $r11 = t2LDRi12 $sp, 16, 14 /* CC::al */, $noreg :: (load 4 from %stack.5) @@ -592,7 +592,7 @@ $r6 = tMOVr $r4, 14 /* CC::al */, $noreg $r7 = tMOVr $r5, 14 /* CC::al */, $noreg $lr = tMOVr $r3, 14 /* CC::al */, $noreg - t2DoLoopStart renamable $r3 + $lr = t2DoLoopStart renamable $r3 bb.13: successors: %bb.10(0x80000000) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/end-positive-offset.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/end-positive-offset.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/end-positive-offset.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/end-positive-offset.mir @@ -7,7 +7,7 @@ define void @size_limit(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) { entry: - call void @llvm.set.loop.iterations.i32(i32 %N) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %N) %scevgep = getelementptr i32, i32* %a, i32 -1 %scevgep4 = getelementptr i32, i32* %c, i32 -1 %scevgep8 = getelementptr i32, i32* %b, i32 -1 @@ -35,7 +35,7 @@ %lsr.iv9 = phi i32* [ %scevgep8, %entry ], [ %scevgep10, %for.body ] %lsr.iv5 = phi i32* [ %scevgep4, %entry ], [ %scevgep6, %for.body ] %lsr.iv1 = phi i32* [ %scevgep, %entry ], [ %scevgep2, %for.body ] - %count = phi i32 [ %N, %entry ], [ %count.next, %for.body ] + %count = phi i32 [ %start, %entry ], [ %count.next, %for.body ] br label %for.body } @@ -43,7 +43,7 @@ declare i32 @llvm.arm.space(i32 immarg, i32) #0 
; Function Attrs: noduplicate nounwind - declare void @llvm.set.loop.iterations.i32(i32) #1 + declare i32 @llvm.start.loop.iterations.i32(i32) #1 ; Function Attrs: noduplicate nounwind declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1 @@ -184,7 +184,7 @@ frame-setup CFI_INSTRUCTION offset $r7, -8 $sp = frame-setup tSUBspi $sp, 8, 14, $noreg frame-setup CFI_INSTRUCTION def_cfa_offset 40 - t2DoLoopStart renamable $r3 + $lr = t2DoLoopStart renamable $r3 renamable $r0, dead $cpsr = tSUBi8 killed renamable $r0, 4, 14, $noreg renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14, $noreg renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14, $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/exitcount.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/exitcount.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/exitcount.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/exitcount.ll @@ -8,21 +8,21 @@ ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: ldrd r12, r4, [r0] -; CHECK-NEXT: ldrd r3, r2, [r0, #8] +; CHECK-NEXT: ldrd r2, r3, [r0, #8] ; CHECK-NEXT: rsb r12, r12, r4, lsl #1 +; CHECK-NEXT: dlstp.16 lr, r12 ; CHECK-NEXT: mov r4, r12 -; CHECK-NEXT: dlstp.16 lr, r4 ; CHECK-NEXT: .LBB0_1: @ %do.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u16 q0, [r2], #16 -; CHECK-NEXT: vstrh.16 q0, [r3], #16 +; CHECK-NEXT: vldrh.u16 q0, [r3], #16 +; CHECK-NEXT: vstrh.16 q0, [r2], #16 ; CHECK-NEXT: letp lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %do.end -; CHECK-NEXT: ldr r3, [r0] +; CHECK-NEXT: ldr r2, [r0] ; CHECK-NEXT: ldr r0, [r0, #8] ; CHECK-NEXT: vmov.i16 q0, #0x1800 ; CHECK-NEXT: add.w r0, r0, r12, lsl #1 -; CHECK-NEXT: dlstp.16 lr, r3 +; CHECK-NEXT: dlstp.16 lr, r2 ; CHECK-NEXT: .LBB0_3: @ %do.body6 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q1, [r1], #16 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll 
b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll @@ -9,8 +9,8 @@ ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} ; CHECK-NEXT: .LBB0_1: @ %vector.ph -; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dlstp.16 lr, r2 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r3, #8 @@ -69,8 +69,8 @@ ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} ; CHECK-NEXT: .LBB1_1: @ %vector.ph -; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dlstp.16 lr, r2 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r3, #8 @@ -129,8 +129,8 @@ ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} ; CHECK-NEXT: .LBB2_1: @ %vector.ph -; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dlstp.32 lr, r2 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: .LBB2_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r3, #4 @@ -189,8 +189,8 @@ ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} ; CHECK-NEXT: .LBB3_1: @ %vector.ph -; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dlstp.32 lr, r2 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r3, #4 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extract-element.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extract-element.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extract-element.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extract-element.mir @@ -17,11 +17,11 @@ br i1 %cmp9, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %tmp5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph - %lsr.iv1 = phi i32 [ 
%lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ] + %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ] %lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %vector.ph ] %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ] %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %tmp13, %vector.body ] @@ -52,7 +52,7 @@ ret i32 %res.0.lcssa } declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) - declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.start.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <4 x i1> @llvm.arm.mve.vctp32(i32) @@ -155,7 +155,7 @@ renamable $r12 = t2SUBri killed renamable $r3, 4, 14, $noreg, $noreg renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14, $noreg, $noreg - t2DoLoopStart renamable $r12 + $lr = t2DoLoopStart renamable $r12 $r3 = tMOVr killed $r12, 14, $noreg bb.2.vector.body: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll @@ -49,10 +49,10 @@ ; CHECK-NEXT: .LBB0_6: @ %for.body.preheader.new ; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: subs r3, #4 -; CHECK-NEXT: add.w lr, r12, r3, lsr #2 -; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: add.w r3, r12, r3, lsr #2 ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dls lr, r3 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: .LBB0_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r4, r1, r3 @@ -228,9 +228,9 @@ ; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 +; CHECK-NEXT: add.w r3, r3, r12, lsr #2 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: movs 
r3, #0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 @@ -321,11 +321,12 @@ ; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 +; CHECK-NEXT: add.w r3, r3, r12, lsr #2 ; CHECK-NEXT: sub.w r12, r2, #1 ; CHECK-NEXT: adr r2, .LCPI2_1 -; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: mov lr, r3 ; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vdup.32 q1, r12 ; CHECK-NEXT: vdup.32 q2, r12 ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-16.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-16.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-16.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-16.mir @@ -13,14 +13,14 @@ br i1 %cmp8, label %vector.ph, label %for.cond.cleanup vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %lsr.iv17 = phi i16* [ %scevgep18, %vector.body ], [ %A, %vector.ph ] %lsr.iv14 = phi i16* [ %scevgep15, %vector.body ], [ %C, %vector.ph ] %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %B, %vector.ph ] - %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ] + %6 = phi i32 [ %start, %vector.ph ], [ %11, %vector.body ] %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ] %lsr.iv13 = bitcast i16* %lsr.iv to <8 x i16>* %lsr.iv1416 = bitcast i16* %lsr.iv14 to <8 x i16>* @@ -41,7 +41,7 @@ for.cond.cleanup: ; preds = %vector.body, %entry ret void } - declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.start.loop.iterations.i32(i32) declare <8 x i1> @llvm.arm.mve.vctp16(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x 
i16>*, i32 immarg, <8 x i1>, <8 x i16>) @@ -149,7 +149,7 @@ renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.2.vector.body: successors: %bb.2(0x7c000000), %bb.3(0x04000000) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-32.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-32.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-32.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-32.mir @@ -20,14 +20,14 @@ br i1 %cmp8, label %vector.ph, label %for.cond.cleanup vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ] %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ] %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ] - %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ] + %6 = phi i32 [ %start, %vector.ph ], [ %11, %vector.body ] %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ] %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>* %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* @@ -48,7 +48,7 @@ for.cond.cleanup: ; preds = %vector.body, %entry ret void } - declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.start.loop.iterations.i32(i32) declare <4 x i1> @llvm.arm.mve.vctp32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) @@ -157,7 +157,7 @@ renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, 
$noreg renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.2.vector.body: successors: %bb.2(0x7c000000), %bb.3(0x04000000) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-8.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-8.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-8.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-8.mir @@ -13,14 +13,14 @@ br i1 %cmp8, label %vector.ph, label %for.cond.cleanup vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %lsr.iv17 = phi i8* [ %scevgep18, %vector.body ], [ %A, %vector.ph ] %lsr.iv14 = phi i8* [ %scevgep15, %vector.body ], [ %C, %vector.ph ] %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %B, %vector.ph ] - %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ] + %6 = phi i32 [ %start, %vector.ph ], [ %11, %vector.body ] %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ] %lsr.iv13 = bitcast i8* %lsr.iv to <16 x i8>* %lsr.iv1416 = bitcast i8* %lsr.iv14 to <16 x i8>* @@ -41,7 +41,7 @@ for.cond.cleanup: ; preds = %vector.body, %entry ret void } - declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.start.loop.iterations.i32(i32) declare <16 x i1> @llvm.arm.mve.vctp8(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) @@ -150,7 +150,7 @@ renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.2.vector.body: 
successors: %bb.2(0x7c000000), %bb.3(0x04000000) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpnot-1.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpnot-1.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpnot-1.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpnot-1.mir @@ -16,11 +16,11 @@ br i1 %cmp9, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %tmp5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph - %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ] + %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ] %lsr.iv.e = phi i32* [ %scevgep.e, %vector.body ], [ %e, %vector.ph ] %lsr.iv.d = phi i16* [ %scevgep.d, %vector.body ], [ %d, %vector.ph ] %lsr.iv.c = phi i16* [ %scevgep.c, %vector.body ], [ %c, %vector.ph ] @@ -64,7 +64,7 @@ } declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2 - declare void @llvm.set.loop.iterations.i32(i32) #3 + declare i32 @llvm.start.loop.iterations.i32(i32) #3 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4 @@ -201,7 +201,7 @@ renamable $lr = t2SUBri killed renamable $lr, 4, 14, $noreg, $noreg renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 renamable $lr = nuw nsw t2ADDrs killed renamable $r4, killed renamable $lr, 19, 14, $noreg, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr $r4 = tMOVr killed $lr, 14, $noreg bb.2.vector.body: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpnot-2.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpnot-2.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpnot-2.mir +++ 
b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpnot-2.mir @@ -16,11 +16,11 @@ br i1 %cmp9, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %tmp5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph - %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ] + %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ] %lsr.iv.e = phi i32* [ %scevgep.e, %vector.body ], [ %e, %vector.ph ] %lsr.iv.d = phi i16* [ %scevgep.d, %vector.body ], [ %d, %vector.ph ] %lsr.iv.c = phi i16* [ %scevgep.c, %vector.body ], [ %c, %vector.ph ] @@ -64,7 +64,7 @@ } declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2 - declare void @llvm.set.loop.iterations.i32(i32) #3 + declare i32 @llvm.start.loop.iterations.i32(i32) #3 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4 @@ -201,7 +201,7 @@ renamable $lr = t2SUBri killed renamable $lr, 4, 14, $noreg, $noreg renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 renamable $lr = nuw nsw t2ADDrs killed renamable $r4, killed renamable $lr, 19, 14, $noreg, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr $r4 = tMOVr killed $lr, 14, $noreg bb.2.vector.body: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpnot-3.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpnot-3.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpnot-3.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpnot-3.mir @@ -16,11 +16,11 @@ br i1 %cmp9, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %tmp5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 
%tmp5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph - %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ] + %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ] %lsr.iv.e = phi i32* [ %scevgep.e, %vector.body ], [ %e, %vector.ph ] %lsr.iv.d = phi i16* [ %scevgep.d, %vector.body ], [ %d, %vector.ph ] %lsr.iv.c = phi i16* [ %scevgep.c, %vector.body ], [ %c, %vector.ph ] @@ -64,7 +64,7 @@ } declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2 - declare void @llvm.set.loop.iterations.i32(i32) #3 + declare i32 @llvm.start.loop.iterations.i32(i32) #3 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4 @@ -201,7 +201,7 @@ renamable $lr = t2SUBri killed renamable $lr, 4, 14, $noreg, $noreg renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 renamable $lr = nuw nsw t2ADDrs killed renamable $r4, killed renamable $lr, 19, 14, $noreg, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr $r4 = tMOVr killed $lr, 14, $noreg bb.2.vector.body: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-1.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-1.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-1.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-1.mir @@ -16,11 +16,11 @@ br i1 %cmp9, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %tmp5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph - %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ] + %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ] %lsr.iv.d = phi i16* [ %scevgep.d, %vector.body ], [ 
%d, %vector.ph ] %lsr.iv.c = phi i16* [ %scevgep.c, %vector.body ], [ %c, %vector.ph ] %lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %vector.ph ] @@ -65,7 +65,7 @@ } declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1 declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #2 - declare void @llvm.set.loop.iterations.i32(i32) #3 + declare i32 @llvm.start.loop.iterations.i32(i32) #3 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4 @@ -200,7 +200,7 @@ renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 renamable $lr = t2SUBri killed renamable $lr, 4, 14, $noreg, $noreg renamable $r5 = nuw nsw t2ADDrs killed renamable $r4, killed renamable $lr, 19, 14, $noreg, $noreg - t2DoLoopStart renamable $r5 + $lr = t2DoLoopStart renamable $r5 $r4 = tMOVr killed $r5, 14, $noreg bb.2.vector.body: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-2.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-2.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-2.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-2.mir @@ -18,11 +18,11 @@ br i1 %cmp9, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %tmp5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph - %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ] + %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ] %lsr.iv.d = phi i16* [ %scevgep.d, %vector.body ], [ %d, %vector.ph ] %lsr.iv.c = phi i16* [ %scevgep.c, %vector.body ], [ %c, %vector.ph ] %lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %vector.ph ] @@ -67,7 +67,7 @@ } declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1 declare i32 
@llvm.vector.reduce.add.v4i32(<4 x i32>) #2 - declare void @llvm.set.loop.iterations.i32(i32) #3 + declare i32 @llvm.start.loop.iterations.i32(i32) #3 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4 @@ -201,7 +201,7 @@ renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 renamable $lr = t2SUBri killed renamable $lr, 4, 14, $noreg, $noreg renamable $r5 = nuw nsw t2ADDrs killed renamable $r4, killed renamable $lr, 19, 14, $noreg, $noreg - t2DoLoopStart renamable $r5 + $lr = t2DoLoopStart renamable $r5 $r4 = tMOVr killed $r5, 14, $noreg bb.2.vector.body: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/invariant-qreg.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/invariant-qreg.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/invariant-qreg.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/invariant-qreg.mir @@ -14,11 +14,11 @@ br i1 %cmp9, label %exit, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %tmp5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph - %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ] + %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ] %lsr.iv20 = phi i32* [ %scevgep20, %vector.body ], [ %c, %vector.ph ] %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ] %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %tmp13, %vector.body ] @@ -55,11 +55,11 @@ br i1 %cmp9, label %exit, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %tmp5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph - %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ] + %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ] %lsr.iv = phi i16* [ 
%scevgep, %vector.body ], [ %a, %vector.ph ] %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ] %lsr.iv17 = bitcast i16* %lsr.iv to <4 x i16>* @@ -92,11 +92,11 @@ br i1 %cmp9, label %exit, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %tmp5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph - %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ] + %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ] %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ] %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ] %lsr.iv17 = bitcast i16* %lsr.iv to <4 x i16>* @@ -120,7 +120,7 @@ declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) - declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.start.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <4 x i1> @llvm.arm.mve.vctp32(i32) @@ -204,7 +204,7 @@ renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg - t2DoLoopStart renamable $r12 + $lr = t2DoLoopStart renamable $r12 $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg bb.2.vector.body: @@ -321,7 +321,7 @@ renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r1, 19, 14 /* CC::al */, $noreg, $noreg renamable $r1 = tADDrSPi $sp, 2, 14 /* CC::al */, $noreg renamable $q0 = MVE_VLDRWU32 killed renamable $r1, 0, 0, $noreg :: (load 16 from %fixed-stack.0, align 8) - t2DoLoopStart renamable $r3 + $lr = t2DoLoopStart renamable $r3 $r1 = 
tMOVr killed $r3, 14 /* CC::al */, $noreg bb.2.vector.body: @@ -440,7 +440,7 @@ renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r1, 19, 14 /* CC::al */, $noreg, $noreg renamable $r1 = tADDrSPi $sp, 2, 14 /* CC::al */, $noreg renamable $q0 = MVE_VLDRWU32 killed renamable $r1, 0, 0, $noreg :: (load 16 from %fixed-stack.0, align 8) - t2DoLoopStart renamable $r3 + $lr = t2DoLoopStart renamable $r3 $r1 = tMOVr killed $r3, 14 /* CC::al */, $noreg bb.2.vector.body: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-chain-store.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-chain-store.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-chain-store.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-chain-store.mir @@ -12,11 +12,11 @@ %3 = lshr i32 %2, 2 %4 = add nuw nsw i32 %3, 1 store i32 %4, i32* %iter.addr, align 4 - call void @llvm.set.loop.iterations.i32(i32 %4) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %4) br label %do.body do.body: ; preds = %do.body, %entry - %lsr.iv = phi i32 [ %lsr.iv.next, %do.body ], [ %4, %entry ] + %lsr.iv = phi i32 [ %lsr.iv.next, %do.body ], [ %start, %entry ] %blkCnt.0 = phi i32 [ %mul, %entry ], [ %sub, %do.body ] %pDst.addr.0 = phi float* [ %pDst, %entry ], [ %add.ptr4, %do.body ] %pSrc.addr.0 = phi float* [ %pSrc, %entry ], [ %add.ptr, %do.body ] @@ -47,12 +47,12 @@ %2 = sub i32 %0, %smin %3 = lshr i32 %2, 2 %4 = add nuw nsw i32 %3, 1 - call void @llvm.set.loop.iterations.i32(i32 %4) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %4) store i32 %4, i32* %iter.addr, align 4 br label %do.body do.body: ; preds = %do.body, %entry - %lsr.iv = phi i32 [ %lsr.iv.next, %do.body ], [ %4, %entry ] + %lsr.iv = phi i32 [ %lsr.iv.next, %do.body ], [ %start, %entry ] %blkCnt.0 = phi i32 [ %mul, %entry ], [ %sub, %do.body ] %pDst.addr.0 = phi float* [ %pDst, %entry ], [ %add.ptr4, %do.body ] %pSrc.addr.0 = phi float* [ %pSrc, %entry ], [ %add.ptr, %do.body ] @@ 
-84,7 +84,7 @@ declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>) #3 ; Function Attrs: noduplicate nounwind - declare void @llvm.set.loop.iterations.i32(i32) #4 + declare i32 @llvm.start.loop.iterations.i32(i32) #4 ; Function Attrs: noduplicate nounwind declare i32 @llvm.loop.decrement.reg.i32(i32, i32) #4 @@ -178,7 +178,7 @@ renamable $r2, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg renamable $lr = nuw nsw t2ADDrs killed renamable $r2, killed renamable $lr, 19, 14 /* CC::al */, $noreg, $noreg t2STRi12 renamable $lr, killed renamable $r3, 0, 14 /* CC::al */, $noreg :: (store 4 into %ir.iter.addr) - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr $r2 = tMOVr killed $lr, 14 /* CC::al */, $noreg bb.1.do.body: @@ -247,8 +247,8 @@ ; CHECK: renamable $lr = t2ADDri killed renamable $r2, 3, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r2, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r2, killed renamable $lr, 19, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $lr = t2DLS killed renamable $lr ; CHECK: t2STRi12 renamable $lr, killed renamable $r3, 0, 14 /* CC::al */, $noreg :: (store 4 into %ir.iter.addr) + ; CHECK: $lr = t2DLS killed renamable $lr ; CHECK: $r2 = tMOVr killed $lr, 14 /* CC::al */, $noreg ; CHECK: bb.1.do.body: ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) @@ -282,8 +282,8 @@ renamable $lr = t2ADDri killed renamable $r2, 3, 14 /* CC::al */, $noreg, $noreg renamable $r2, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg renamable $lr = nuw nsw t2ADDrs killed renamable $r2, killed renamable $lr, 19, 14 /* CC::al */, $noreg, $noreg - t2DoLoopStart renamable $lr t2STRi12 renamable $lr, killed renamable $r3, 0, 14 /* CC::al */, $noreg :: (store 4 into %ir.iter.addr) + $lr = t2DoLoopStart renamable $lr $r2 = tMOVr killed $lr, 14 /* CC::al */, $noreg bb.1.do.body: diff --git 
a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-chain.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-chain.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-chain.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-chain.mir @@ -13,14 +13,14 @@ %2 = sub i32 %0, %smin %3 = lshr i32 %2, 2 %4 = add nuw nsw i32 %3, 1 - call void @llvm.set.loop.iterations.i32(i32 %4) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %4) br label %do.body do.body: ; preds = %do.body, %entry %blkCnt.0 = phi i32 [ %mul, %entry ], [ %sub, %do.body ] %pDst.addr.0 = phi float* [ %pDst, %entry ], [ %add.ptr4, %do.body ] %pSrc.addr.0 = phi float* [ %pSrc, %entry ], [ %add.ptr, %do.body ] - %5 = phi i32 [ %4, %entry ], [ %9, %do.body ] + %5 = phi i32 [ %start, %entry ], [ %9, %do.body ] %6 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blkCnt.0) %input_cast = bitcast float* %pSrc.addr.0 to <4 x float>* %7 = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %input_cast, i32 4, <4 x i1> %6, <4 x float> undef) @@ -40,7 +40,7 @@ declare <4 x i1> @llvm.arm.mve.vctp32(i32) #1 declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>) declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>) - declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.start.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) ... 
@@ -149,7 +149,7 @@ renamable $lr = nuw nsw t2ADDrs killed renamable $r2, killed renamable $r12, 19, 14, $noreg, $noreg renamable $r2 = tLEApcrel %const.0, 14, $noreg renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 0, $noreg :: (load 16 from constant-pool) - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.1.do.body (align 4): successors: %bb.1(0x7c000000), %bb.2(0x04000000) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-itercount.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-itercount.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-itercount.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-itercount.mir @@ -14,14 +14,14 @@ %2 = sub i32 %0, %smin %3 = lshr i32 %2, 2 %4 = add nuw nsw i32 %3, 1 - call void @llvm.set.loop.iterations.i32(i32 %4) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %4) br label %do.body do.body: ; preds = %do.body, %entry %blkCnt.0 = phi i32 [ %mul, %entry ], [ %sub, %do.body ] %pDst.addr.0 = phi float* [ %pDst, %entry ], [ %add.ptr4, %do.body ] %pSrc.addr.0 = phi float* [ %pSrc, %entry ], [ %add.ptr, %do.body ] - %5 = phi i32 [ %4, %entry ], [ %9, %do.body ] + %5 = phi i32 [ %start, %entry ], [ %9, %do.body ] %6 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blkCnt.0) %input_cast = bitcast float* %pSrc.addr.0 to <4 x float>* %7 = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %input_cast, i32 4, <4 x i1> %6, <4 x float> undef) @@ -41,7 +41,7 @@ declare <4 x i1> @llvm.arm.mve.vctp32(i32) #1 declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>) declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>) - declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.start.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) ... 
@@ -140,7 +140,7 @@ renamable $lr = nuw nsw t2ADDrs killed renamable $r2, killed renamable $r12, 19, 14, $noreg, $noreg renamable $r2 = tLEApcrel %const.0, 14, $noreg renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 0, $noreg :: (load 16 from constant-pool) - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.1.do.body (align 4): successors: %bb.1(0x7c000000), %bb.2(0x04000000) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-mov.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-mov.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-mov.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-mov.mir @@ -78,6 +78,7 @@ ; CHECK: successors: %bb.5(0x80000000) ; CHECK: liveins: $q0, $r0, $r1, $r2, $r4 ; CHECK: renamable $s4 = nnan ninf nsz VADDS renamable $s0, renamable $s1, 14 /* CC::al */, $noreg + ; CHECK: dead $lr = tMOVr $r4, 14 /* CC::al */, $noreg ; CHECK: $r3 = tMOVr $r1, 14 /* CC::al */, $noreg ; CHECK: renamable $s4 = nnan ninf nsz VADDS renamable $s2, killed renamable $s4, 14 /* CC::al */, $noreg ; CHECK: renamable $s0 = nnan ninf nsz VADDS killed renamable $s3, killed renamable $s4, 14 /* CC::al */, $noreg, implicit killed $q0 @@ -151,7 +152,7 @@ renamable $r4 = nuw nsw t2ADDrs killed renamable $r4, killed renamable $r3, 19, 14, $noreg, $noreg renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 $r3 = tMOVr $r0, 14, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.3: successors: %bb.3(0x7c000000), %bb.4(0x04000000) @@ -178,7 +179,7 @@ renamable $s0 = nnan ninf nsz VADDS killed renamable $s3, killed renamable $s4, 14, $noreg, implicit $q0 $s2 = VMOVSR $r1, 14, $noreg renamable $s2 = VUITOS killed renamable $s2, 14, $noreg - t2DoLoopStart killed $r4 + $lr = t2DoLoopStart killed $r4 renamable $s4 = nnan ninf nsz VDIVS killed renamable $s0, killed renamable $s2, 14, $noreg renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 diff --git 
a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-random.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-random.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-random.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-random.mir @@ -15,14 +15,14 @@ %2 = sub i32 %0, %smin %3 = lshr i32 %2, 2 %4 = add nuw nsw i32 %3, 1 - call void @llvm.set.loop.iterations.i32(i32 %4) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %4) br label %do.body do.body: ; preds = %do.body, %entry %blkCnt.0 = phi i32 [ %mul, %entry ], [ %sub, %do.body ] %pDst.addr.0 = phi float* [ %pDst, %entry ], [ %add.ptr4, %do.body ] %pSrc.addr.0 = phi float* [ %pSrc, %entry ], [ %add.ptr, %do.body ] - %5 = phi i32 [ %4, %entry ], [ %9, %do.body ] + %5 = phi i32 [ %start, %entry ], [ %9, %do.body ] %6 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blkCnt.0) %input_cast = bitcast float* %pSrc.addr.0 to <4 x float>* %7 = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %input_cast, i32 4, <4 x i1> %6, <4 x float> undef) @@ -42,7 +42,7 @@ declare <4 x i1> @llvm.arm.mve.vctp32(i32) #1 declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>) declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>) - declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.start.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) ... 
@@ -149,7 +149,7 @@ renamable $lr = nuw nsw t2ADDrs killed renamable $r2, killed renamable $r12, 19, 14, $noreg, $noreg renamable $r2 = tLEApcrel %const.0, 14, $noreg renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 0, $noreg :: (load 16 from constant-pool) - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.1.do.body (align 4): successors: %bb.1(0x7c000000), %bb.2(0x04000000) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-two-vcmp-reordered.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-two-vcmp-reordered.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-two-vcmp-reordered.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-two-vcmp-reordered.mir @@ -20,11 +20,11 @@ %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - call void @llvm.set.loop.iterations.i32(i32 %tmp5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph - %lsr.iv = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ] + %lsr.iv = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ] %lsr.iv3 = phi i32* [ %scevgep4, %vector.body ], [ %b, %vector.ph ] %lsr.iv1 = phi i32* [ %scevgep, %vector.body ], [ %a, %vector.ph ] %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ] @@ -56,7 +56,7 @@ declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) declare <4 x i1> @llvm.arm.mve.vctp32(i32) - declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.start.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) ...
@@ -168,7 +168,7 @@ renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg - t2DoLoopStart renamable $r3 + $lr = t2DoLoopStart renamable $r3 $r4 = tMOVr killed $r3, 14 /* CC::al */, $noreg renamable $r3 = tLEApcrel %const.0, 14 /* CC::al */, $noreg renamable $q0 = MVE_VLDRWU32 killed renamable $r3, 0, 0, $noreg :: (load 16 from constant-pool) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-two-vcmp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-two-vcmp.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-two-vcmp.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-two-vcmp.mir @@ -18,11 +18,11 @@ %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - call void @llvm.set.loop.iterations.i32(i32 %tmp5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph - %lsr.iv = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ] + %lsr.iv = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ] %lsr.iv3 = phi i32* [ %scevgep4, %vector.body ], [ %b, %vector.ph ] %lsr.iv1 = phi i32* [ %scevgep, %vector.body ], [ %a, %vector.ph ] %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ] @@ -54,7 +54,7 @@ declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) declare <4 x i1> @llvm.arm.mve.vctp32(i32) - declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.start.loop.iterations.i32(i32) declare i32
@llvm.loop.decrement.reg.i32.i32.i32(i32, i32) ... --- @@ -165,7 +165,7 @@ renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg - t2DoLoopStart renamable $r3 + $lr = t2DoLoopStart renamable $r3 $r4 = tMOVr killed $r3, 14 /* CC::al */, $noreg renamable $r3 = tLEApcrel %const.0, 14 /* CC::al */, $noreg renamable $q0 = MVE_VLDRWU32 killed renamable $r3, 0, 0, $noreg :: (load 16 from constant-pool) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-vcmp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-vcmp.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-vcmp.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-vcmp.mir @@ -19,7 +19,7 @@ %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - call void @llvm.set.loop.iterations.i32(i32 %5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -27,7 +27,7 @@ %lsr.iv1 = phi i32* [ %scevgep, %vector.body ], [ %a, %vector.ph ] %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ] %elts.rem = phi i32 [ %N, %vector.ph ], [ %elts.rem.next, %vector.body ] - %6 = phi i32 [ %5, %vector.ph ], [ %12, %vector.body ] + %6 = phi i32 [ %start, %vector.ph ], [ %12, %vector.body ] %lsr.iv35 = bitcast i32* %lsr.iv3 to <4 x i32>* %lsr.iv12 = bitcast i32* %lsr.iv1 to <4 x i32>* %7 = insertelement <4 x i32> undef, i32 %div, i32 0 @@ -52,7 +52,7 @@ declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) declare <4 x i1>
@llvm.arm.mve.vctp32(i32) - declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.start.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) ... @@ -147,7 +147,7 @@ renamable $q0 = MVE_VLDRWU32 killed renamable $r3, 0, 0, $noreg :: (load 16 from constant-pool) renamable $r3, dead $cpsr = tLSRri renamable $r2, 1, 14 /* CC::al */, $noreg renamable $q1 = MVE_VDUP32 killed renamable $r3, 0, $noreg, undef renamable $q1 - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.2.vector.body: successors: %bb.2(0x7c000000), %bb.3(0x04000000) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/livereg-no-loop-def.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/livereg-no-loop-def.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/livereg-no-loop-def.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/livereg-no-loop-def.mir @@ -14,11 +14,11 @@ br i1 %cmp9, label %exit, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %tmp5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph - %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ] + %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ] %lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %vector.ph ] %lsr.iv20 = phi i32* [ %scevgep20, %vector.body ], [ %c, %vector.ph ] %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ] @@ -50,7 +50,7 @@ declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) - declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.start.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <4 x i1> @llvm.arm.mve.vctp32(i32) @@ -136,7 +136,7 @@ renamable $q1 = MVE_VMOVimmi32 0, 0, $noreg, undef 
renamable $q1 renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg renamable $r4 = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg - t2DoLoopStart renamable $r4 + $lr = t2DoLoopStart renamable $r4 $r12 = tMOVr killed $r4, 14, $noreg bb.2.vector.body: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-dec-copy-chain.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-dec-copy-chain.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-dec-copy-chain.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-dec-copy-chain.mir @@ -18,7 +18,7 @@ br i1 %tmp7, label %bb13, label %bb12 bb12: ; preds = %bb4 - call void @llvm.set.loop.iterations.i32(i32 %tmp11) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp11) br label %bb28 bb13: ; preds = %bb28, %bb4 @@ -45,7 +45,7 @@ ret void bb28: ; preds = %bb28, %bb12 - %lsr.iv15 = phi i32 [ %lsr.iv.next16, %bb28 ], [ %tmp11, %bb12 ] + %lsr.iv15 = phi i32 [ %lsr.iv.next16, %bb28 ], [ %start, %bb12 ] %lsr.iv = phi i32 [ %lsr.iv.next, %bb28 ], [ 0, %bb12 ] %tmp29 = phi i32 [ 0, %bb12 ], [ %tmp54, %bb28 ] %0 = bitcast i32* %arg1 to i8* @@ -145,7 +145,7 @@ br label %bb27 } - declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.start.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) ... 
@@ -387,7 +387,7 @@ renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg renamable $r8 = t2MOVi 0, 14, $noreg, $noreg renamable $r3 = nuw nsw t2ADDrs killed renamable $r4, killed renamable $r3, 19, 14, $noreg, $noreg - t2DoLoopStart renamable $r3 + $lr = t2DoLoopStart renamable $r3 $r12 = tMOVr killed $r3, 14, $noreg renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-dec-copy-prev-iteration.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-dec-copy-prev-iteration.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-dec-copy-prev-iteration.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-dec-copy-prev-iteration.mir @@ -18,7 +18,7 @@ br i1 %tmp7, label %bb13, label %bb12 bb12: ; preds = %bb4 - call void @llvm.set.loop.iterations.i32(i32 %tmp11) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp11) br label %bb28 bb13: ; preds = %bb28, %bb4 @@ -46,7 +46,7 @@ ret i32 %res bb28: ; preds = %bb28, %bb12 - %lsr.iv15 = phi i32 [ %lsr.iv.next16, %bb28 ], [ %tmp11, %bb12 ] + %lsr.iv15 = phi i32 [ %lsr.iv.next16, %bb28 ], [ %start, %bb12 ] %lsr.iv = phi i32 [ %lsr.iv.next, %bb28 ], [ 0, %bb12 ] %tmp29 = phi i32 [ 0, %bb12 ], [ %tmp54, %bb28 ] %0 = bitcast i32* %arg1 to i8* @@ -146,7 +146,7 @@ br label %bb27 } - declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.start.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) ... 
@@ -265,7 +265,8 @@ ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg ; CHECK: renamable $r8 = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r3 = nuw nsw t2ADDrs killed renamable $r4, killed renamable $r3, 19, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $lr = t2DLS killed renamable $r3 + ; CHECK: dead $lr = t2DLS renamable $r3 + ; CHECK: $lr = tMOVr killed $r3, 14 /* CC::al */, $noreg ; CHECK: renamable $r3, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg ; CHECK: bb.5.bb28: ; CHECK: successors: %bb.5(0x7c000000), %bb.6(0x04000000) @@ -403,7 +404,7 @@ renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg renamable $r8 = t2MOVi 0, 14, $noreg, $noreg renamable $r3 = nuw nsw t2ADDrs killed renamable $r4, killed renamable $r3, 19, 14, $noreg, $noreg - t2DoLoopStart renamable $r3 + $lr = t2DoLoopStart renamable $r3 $lr = tMOVr killed $r3, 14, $noreg renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-dec-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-dec-liveout.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-dec-liveout.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-dec-liveout.mir @@ -18,7 +18,7 @@ br i1 %tmp7, label %bb13, label %bb12 bb12: ; preds = %bb4 - call void @llvm.set.loop.iterations.i32(i32 %tmp11) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp11) br label %bb28 bb13: ; preds = %bb28, %bb4 @@ -46,7 +46,7 @@ ret i32 %res bb28: ; preds = %bb28, %bb12 - %lsr.iv15 = phi i32 [ %lsr.iv.next16, %bb28 ], [ %tmp11, %bb12 ] + %lsr.iv15 = phi i32 [ %lsr.iv.next16, %bb28 ], [ %start, %bb12 ] %lsr.iv = phi i32 [ %lsr.iv.next, %bb28 ], [ 0, %bb12 ] %tmp29 = phi i32 [ 0, %bb12 ], [ %tmp54, %bb28 ] %0 = bitcast i32* %arg1 to i8* @@ -146,7 +146,7 @@ br label %bb27 } - declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.start.loop.iterations.i32(i32) declare i32 
@llvm.loop.decrement.reg.i32.i32.i32(i32, i32) ... @@ -265,7 +265,8 @@ ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg ; CHECK: renamable $r8 = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r3 = nuw nsw t2ADDrs killed renamable $r4, killed renamable $r3, 19, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $lr = t2DLS killed renamable $r3 + ; CHECK: dead $lr = t2DLS renamable $r3 + ; CHECK: $lr = tMOVr killed $r3, 14 /* CC::al */, $noreg ; CHECK: renamable $r3, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg ; CHECK: bb.5.bb28: ; CHECK: successors: %bb.5(0x7c000000), %bb.6(0x04000000) @@ -403,7 +404,7 @@ renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg renamable $r8 = t2MOVi 0, 14, $noreg, $noreg renamable $r3 = nuw nsw t2ADDrs killed renamable $r4, killed renamable $r3, 19, 14, $noreg, $noreg - t2DoLoopStart renamable $r3 + $lr = t2DoLoopStart renamable $r3 $lr = tMOVr $r3, 14, $noreg renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-guards.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-guards.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-guards.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-guards.ll @@ -1,4 +1,3 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=thumbv8.1m.main -disable-arm-loloops=false -mattr=+lob -stop-after=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s ; RUN: llc -mtriple=thumbv8.1m.main -disable-arm-loloops=false -mattr=+lob -stop-after=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK-GLOBAL @@ -16,10 +15,10 @@ ; CHECK: ne_and_guard ; CHECK: body: ; CHECK: bb.0.entry: -; CHECK: t2CMPri renamable $lr, 0 +; CHECK: tCMPi8 renamable $r0, 0 ; CHECK: tBcc %bb.4 ; CHECK: bb.2.while.body.preheader: -; CHECK: $lr = t2DLS killed renamable $lr +; CHECK: $lr = t2DLS killed renamable 
$r0 ; CHECK: bb.3.while.body: ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.3 define void @ne_and_guard(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { @@ -49,10 +48,10 @@ ; CHECK: ne_preheader ; CHECK: body: ; CHECK: bb.0.entry: -; CHECK: t2CMPri renamable $lr, 0 +; CHECK: tCMPi8 renamable $r0, 0 ; CHECK: tBcc %bb.4 ; CHECK: bb.2.while.body.preheader: -; CHECK: $lr = t2DLS killed renamable $lr +; CHECK: $lr = t2DLS killed renamable $r0 ; CHECK: bb.3.while.body: ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.3 define void @ne_preheader(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { @@ -84,10 +83,10 @@ ; CHECK: eq_preheader ; CHECK: body: ; CHECK: bb.0.entry: -; CHECK: t2CMPri renamable $lr, 0 +; CHECK: tCMPi8 renamable $r0, 0 ; CHECK: tBcc %bb.4 ; CHECK: bb.2.while.body.preheader: -; CHECK: $lr = t2DLS killed renamable $lr +; CHECK: $lr = t2DLS killed renamable $r0 ; CHECK: bb.3.while.body: ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.3 define void @eq_preheader(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { @@ -119,10 +118,10 @@ ; CHECK: ne_prepreheader ; CHECK: body: ; CHECK: bb.0.entry: -; CHECK: t2CMPri renamable $lr, 0 +; CHECK: t2CMPri renamable $r12, 0 ; CHECK: tBcc %bb.4 ; CHECK: bb.2.while.body.preheader: -; CHECK: $lr = t2DLS killed renamable $lr +; CHECK: $lr = t2DLS killed renamable $r12 ; CHECK: bb.3.while.body: ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.3 define void @ne_prepreheader(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { @@ -153,7 +152,7 @@ ; CHECK: be_ne ; CHECK: body: ; CHECK: bb.0.entry: -; CHECK: $lr = t2DLS killed renamable $lr +; CHECK: $lr = t2DLS killed renamable $r12 ; CHECK: bb.2.do.body: ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 define void @be_ne(i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { diff --git 
a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lstp-insertion-position.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lstp-insertion-position.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lstp-insertion-position.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lstp-insertion-position.mir @@ -15,14 +15,14 @@ vector.ph: ; preds = %entry %6 = insertelement <4 x float> , float %init, i32 0 - call void @llvm.set.loop.iterations.i32(i32 %5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %lsr.iv13 = phi float* [ %scevgep14, %vector.body ], [ %b, %vector.ph ] %lsr.iv = phi float* [ %scevgep, %vector.body ], [ %a, %vector.ph ] %vec.phi = phi <4 x float> [ %6, %vector.ph ], [ %13, %vector.body ] - %7 = phi i32 [ %5, %vector.ph ], [ %14, %vector.body ] + %7 = phi i32 [ %start, %vector.ph ], [ %14, %vector.body ] %8 = phi i32 [ %N, %vector.ph ], [ %10, %vector.body ] %lsr.iv12 = bitcast float* %lsr.iv to <4 x float>* %lsr.iv1315 = bitcast float* %lsr.iv13 to <4 x float>* @@ -63,14 +63,14 @@ vector.ph: ; preds = %entry %6 = insertelement <4 x float> , float %init, i32 0 - call void @llvm.set.loop.iterations.i32(i32 %5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %lsr.iv14 = phi float* [ %scevgep15, %vector.body ], [ %b, %vector.ph ] %lsr.iv = phi float* [ %scevgep, %vector.body ], [ %a, %vector.ph ] %vec.phi = phi <4 x float> [ %6, %vector.ph ], [ %13, %vector.body ] - %7 = phi i32 [ %5, %vector.ph ], [ %14, %vector.body ] + %7 = phi i32 [ %start, %vector.ph ], [ %14, %vector.body ] %8 = phi i32 [ %shr, %vector.ph ], [ %10, %vector.body ] %lsr.iv13 = bitcast float* %lsr.iv to <4 x float>* %lsr.iv1416 = bitcast float* %lsr.iv14 to <4 x float>* @@ -99,7 +99,7 @@ declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x 
i1>, <4 x float>) declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>) - declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.start.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32(i32, i32) declare <4 x i1> @llvm.arm.mve.vctp32(i32) @@ -205,7 +205,7 @@ renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg renamable $r3 = tLDRpci %const.0, 14 /* CC::al */, $noreg :: (load 4 from constant-pool) - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr renamable $q1 = MVE_VDUP32 killed renamable $r3, 0, $noreg, undef renamable $q1 $s4 = VMOVS killed $s0, 14 /* CC::al */, $noreg, implicit killed $q1, implicit-def $q1 @@ -341,7 +341,7 @@ renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg renamable $lr = t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg renamable $r3 = tLDRpci %const.0, 14 /* CC::al */, $noreg :: (load 4 from constant-pool) - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr renamable $q1 = MVE_VDUP32 killed renamable $r3, 0, $noreg, undef renamable $q1 renamable $r2, dead $cpsr = tLSRri killed renamable $r2, 2, 14 /* CC::al */, $noreg $s4 = VMOVS killed $s0, 14 /* CC::al */, $noreg, implicit killed $q1, implicit-def $q1 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/massive.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/massive.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/massive.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/massive.mir @@ -16,7 +16,7 @@ %scevgep = getelementptr i32, i32* %a, i32 -1 %scevgep4 = getelementptr i32, i32* %c, i32 -1 %scevgep8 = getelementptr i32, i32* %b, i32 -1 - call void @llvm.set.loop.iterations.i32(i32 %N) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %N) br label %for.body for.cond.cleanup: ; preds = %for.body, %entry @@ -26,7 +26,7 @@ %lsr.iv9 = 
phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ] %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ] %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ] - %0 = phi i32 [ %N, %for.body.preheader ], [ %3, %for.body ] + %0 = phi i32 [ %start, %for.body.preheader ], [ %3, %for.body ] %size = call i32 @llvm.arm.space(i32 4096, i32 undef) %scevgep3 = getelementptr i32, i32* %lsr.iv9, i32 1 %1 = load i32, i32* %scevgep3, align 4 @@ -47,7 +47,7 @@ declare i32 @llvm.arm.space(i32 immarg, i32) #0 ; Function Attrs: noduplicate nounwind - declare void @llvm.set.loop.iterations.i32(i32) #1 + declare i32 @llvm.start.loop.iterations.i32(i32) #1 ; Function Attrs: noduplicate nounwind declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1 @@ -157,7 +157,7 @@ renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14, $noreg renamable $r0, dead $cpsr = tSUBi8 killed renamable $r0, 4, 14, $noreg $lr = tMOVr $r3, 14, $noreg - t2DoLoopStart killed $r3 + $lr = t2DoLoopStart killed $r3 bb.2.for.body: successors: %bb.2(0x7c000000), %bb.3(0x04000000) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix-debug.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix-debug.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix-debug.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix-debug.mir @@ -26,14 +26,14 @@ call void @llvm.dbg.value(metadata i32 0, metadata !31, metadata !DIExpression()), !dbg !32 %arrayidx7.us = getelementptr inbounds i32, i32* %e, i32 %i.031.us, !dbg !38 %arrayidx7.promoted.us = load i32, i32* %arrayidx7.us, align 4, !dbg !41 - call void @llvm.set.loop.iterations.i32(i32 %d), !dbg !46 + %start = call i32 @llvm.start.loop.iterations.i32(i32 %d), !dbg !46 br label %for.body3.us, !dbg !46 for.body3.us: ; preds = %for.body3.us, %for.cond1.preheader.us %lsr.iv5 = phi i16* [ %scevgep6, %for.body3.us ], [ %lsr.iv2, %for.cond1.preheader.us ], !dbg !32 %lsr.iv1 = phi 
i16* [ %scevgep, %for.body3.us ], [ %l, %for.cond1.preheader.us ], !dbg !32 %add829.us = phi i32 [ %arrayidx7.promoted.us, %for.cond1.preheader.us ], [ %add8.us, %for.body3.us ], !dbg !32 - %1 = phi i32 [ %d, %for.cond1.preheader.us ], [ %4, %for.body3.us ], !dbg !32 + %1 = phi i32 [ %start, %for.cond1.preheader.us ], [ %4, %for.body3.us ], !dbg !32 call void @llvm.dbg.value(metadata i32 undef, metadata !31, metadata !DIExpression()), !dbg !32 %2 = load i16, i16* %lsr.iv5, align 2, !dbg !47 %conv.us = sext i16 %2 to i32, !dbg !47 @@ -67,7 +67,7 @@ } declare !dbg !4 dso_local arm_aapcscc signext i16 @get_input(i32, i32*, i16 signext) declare void @llvm.dbg.value(metadata, metadata, metadata) - declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.start.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) !llvm.dbg.cu = !{!0} @@ -325,7 +325,7 @@ $r3 = tMOVr $r5, 14, $noreg, debug-location !32 $r0 = tMOVr $r8, 14, $noreg, debug-location !32 $lr = tMOVr $r10, 14, $noreg, debug-location !32 - t2DoLoopStart renamable $r10, debug-location !46 + $lr = t2DoLoopStart renamable $r10, debug-location !46 bb.3.for.body3.us: successors: %bb.3(0x7c000000), %bb.4(0x04000000) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix.mir @@ -10,7 +10,7 @@ br i1 %cmp19.i, label %for.body.i.preheader, label %c.exit.thread for.body.i.preheader: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %d) + %start1 = call i32 @llvm.start.loop.iterations.i32(i32 %d) br label %for.body.i c.exit.thread: ; preds = %entry @@ -22,7 +22,7 @@ %lsr.iv15 = phi i32* [ %e, %for.body.i.preheader ], [ %scevgep16, %for.body.i ] %h.022.i = phi i16 [ %h.1.i, %for.body.i ], [ 0, %for.body.i.preheader ] %f.020.i = phi i32 [ %f.1.i, %for.body.i ], [ undef, %for.body.i.preheader ] - 
%0 = phi i32 [ %d, %for.body.i.preheader ], [ %2, %for.body.i ] + %0 = phi i32 [ %start1, %for.body.i.preheader ], [ %2, %for.body.i ] %1 = load i32, i32* %lsr.iv15, align 4 %add.i = add nsw i32 %1, %f.020.i %cmp1.i = icmp sgt i32 %add.i, 0 @@ -60,14 +60,14 @@ %arrayidx12.us = getelementptr inbounds i32, i32* %e, i32 %i.064.us %arrayidx12.promoted.us = load i32, i32* %arrayidx12.us, align 4 %11 = insertelement <4 x i32> , i32 %arrayidx12.promoted.us, i32 0 - call void @llvm.set.loop.iterations.i32(i32 %8) + %start2 = call i32 @llvm.start.loop.iterations.i32(i32 %8) br label %vector.body vector.body: ; preds = %vector.body, %for.cond4.preheader.us %lsr.iv10 = phi i16* [ %scevgep11, %vector.body ], [ %lsr.iv7, %for.cond4.preheader.us ] %lsr.iv4 = phi i16* [ %scevgep5, %vector.body ], [ %l, %for.cond4.preheader.us ] %vec.phi = phi <4 x i32> [ %11, %for.cond4.preheader.us ], [ %19, %vector.body ] - %12 = phi i32 [ %8, %for.cond4.preheader.us ], [ %20, %vector.body ] + %12 = phi i32 [ %start2, %for.cond4.preheader.us ], [ %20, %vector.body ] %13 = phi i32 [ %d, %for.cond4.preheader.us ], [ %15, %vector.body ] %lsr.iv1012 = bitcast i16* %lsr.iv10 to <4 x i16>* %lsr.iv46 = bitcast i16* %lsr.iv4 to <4 x i16>* @@ -108,14 +108,14 @@ br i1 %29, label %for.body.i57.preheader, label %c.exit59 for.body.i57.preheader: ; preds = %for.end16 - call void @llvm.set.loop.iterations.i32(i32 %d) + %start3 = call i32 @llvm.start.loop.iterations.i32(i32 %d) br label %for.body.i57 for.body.i57: ; preds = %for.body.i57, %for.body.i57.preheader %lsr.iv1 = phi i32* [ %e, %for.body.i57.preheader ], [ %scevgep, %for.body.i57 ] %h.022.i44 = phi i16 [ %h.1.i54, %for.body.i57 ], [ 0, %for.body.i57.preheader ] %f.020.i46 = phi i32 [ %f.1.i51, %for.body.i57 ], [ undef, %for.body.i57.preheader ] - %30 = phi i32 [ %d, %for.body.i57.preheader ], [ %32, %for.body.i57 ] + %30 = phi i32 [ %start3, %for.body.i57.preheader ], [ %32, %for.body.i57 ] %31 = load i32, i32* %lsr.iv1, align 4 %add.i48 = add nsw 
i32 %31, %f.020.i46 %cmp1.i49 = icmp sgt i32 %add.i48, 0 @@ -142,7 +142,7 @@ declare dso_local arm_aapcs_vfpcc signext i16 @crc16(...) local_unnamed_addr #0 declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1 declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #2 - declare void @llvm.set.loop.iterations.i32(i32) #3 + declare i32 @llvm.start.loop.iterations.i32(i32) #3 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4 @@ -385,7 +385,7 @@ renamable $r2 = IMPLICIT_DEF $r10 = tMOVr $r0, 14, $noreg $lr = tMOVr $r0, 14, $noreg - t2DoLoopStart killed renamable $r0 + $lr = t2DoLoopStart killed renamable $r0 bb.2.for.body.i: successors: %bb.2(0x7c000000), %bb.3(0x04000000) @@ -443,7 +443,7 @@ $r6 = tMOVr $r5, 14, $noreg $r1 = tMOVr $r8, 14, $noreg $lr = tMOVr $r0, 14, $noreg - t2DoLoopStart renamable $r0 + $lr = t2DoLoopStart renamable $r0 bb.6.vector.body: successors: %bb.6(0x7c000000), %bb.7(0x04000000) @@ -488,7 +488,7 @@ renamable $r0, dead $cpsr = tMOVi8 0, 14, $noreg renamable $r1 = IMPLICIT_DEF - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.10.for.body.i57: successors: %bb.10(0x7c000000), %bb.11(0x04000000) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dls.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dls.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dls.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dls.mir @@ -9,13 +9,13 @@ entry: %scevgep = getelementptr i32, i32* %q, i32 -1 %scevgep3 = getelementptr i32, i32* %p, i32 -1 - call void @llvm.set.loop.iterations.i32(i32 %n) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %n) br label %while.body while.body: ; preds = %while.body, %entry %lsr.iv4 = phi i32* [ %scevgep5, %while.body ], [ %scevgep3, %entry ] %lsr.iv = phi i32* [ %scevgep1, %while.body ], [ %scevgep, %entry ] - %0 = phi i32 [ %n, %entry ], [ %2, %while.body ] + 
%0 = phi i32 [ %start, %entry ], [ %2, %while.body ] %scevgep6 = getelementptr i32, i32* %lsr.iv, i32 1 %scevgep2 = getelementptr i32, i32* %lsr.iv4, i32 1 %1 = load i32, i32* %scevgep6, align 4 @@ -30,7 +30,7 @@ ret i32 0 } - declare void @llvm.set.loop.iterations.i32(i32) #0 + declare i32 @llvm.start.loop.iterations.i32(i32) #0 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #0 attributes #0 = { noduplicate nounwind } @@ -91,7 +91,8 @@ ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 - ; CHECK: $lr = t2DLS killed $r0 + ; CHECK: dead $lr = t2DLS $r0 + ; CHECK: $lr = tMOVr killed $r0, 14 /* CC::al */, $noreg ; CHECK: renamable $r0, dead $cpsr = tSUBi3 killed renamable $r1, 4, 14 /* CC::al */, $noreg ; CHECK: renamable $r1, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14 /* CC::al */, $noreg ; CHECK: bb.1.while.body: @@ -111,7 +112,7 @@ frame-setup CFI_INSTRUCTION def_cfa_offset 8 frame-setup CFI_INSTRUCTION offset $lr, -4 frame-setup CFI_INSTRUCTION offset $r7, -8 - t2DoLoopStart $r0 + $lr = t2DoLoopStart $r0 $lr = tMOVr killed $r0, 14, $noreg renamable $r0, dead $cpsr = tSUBi3 killed renamable $r1, 4, 14, $noreg renamable $r1, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14, $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dlstp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dlstp.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dlstp.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dlstp.mir @@ -1,6 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -run-pass=arm-low-overhead-loops -tail-predication=enabled %s -o - | FileCheck %s +# TODO: As far as I can tell this test is fine. Tail predicating the second loop means we remove the instruction that would otherwise block the first.
+ --- | define arm_aapcs_vfpcc void @arm_var_f32_mve(float* %pSrc, i32 %blockSize, float* nocapture %pResult) #0 { entry: @@ -15,14 +17,14 @@ %6 = sub i32 %0, %smin3 %7 = lshr i32 %6, 2 %8 = add nuw nsw i32 %7, 1 - call void @llvm.set.loop.iterations.i32(i32 %8) + %start1 = call i32 @llvm.start.loop.iterations.i32(i32 %8) br label %do.body.i do.body.i: ; preds = %do.body.i, %entry %blkCnt.0.i = phi i32 [ %13, %do.body.i ], [ %blockSize, %entry ] %sumVec.0.i = phi <4 x float> [ %12, %do.body.i ], [ zeroinitializer, %entry ] %pSrc.addr.0.i = phi float* [ %add.ptr.i, %do.body.i ], [ %pSrc, %entry ] - %9 = phi i32 [ %8, %entry ], [ %14, %do.body.i ] + %9 = phi i32 [ %start1, %entry ], [ %14, %do.body.i ] %pSrc.addr.0.i2 = bitcast float* %pSrc.addr.0.i to <4 x float>* %10 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blkCnt.0.i) %11 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %pSrc.addr.0.i2, i32 4, <4 x i1> %10, <4 x float> zeroinitializer) @@ -42,14 +44,14 @@ %18 = insertelement <4 x i32> undef, i32 %17, i64 0 %19 = shufflevector <4 x i32> %18, <4 x i32> undef, <4 x i32> zeroinitializer %20 = bitcast <4 x i32> %19 to <4 x float> - call void @llvm.set.loop.iterations.i32(i32 %4) + %start2 = call i32 @llvm.start.loop.iterations.i32(i32 %4) br label %do.body do.body: ; preds = %do.body, %arm_mean_f32_mve.exit %blkCnt.0 = phi i32 [ %blockSize, %arm_mean_f32_mve.exit ], [ %26, %do.body ] %sumVec.0 = phi <4 x float> [ zeroinitializer, %arm_mean_f32_mve.exit ], [ %25, %do.body ] %pSrc.addr.0 = phi float* [ %pSrc, %arm_mean_f32_mve.exit ], [ %add.ptr, %do.body ] - %21 = phi i32 [ %4, %arm_mean_f32_mve.exit ], [ %27, %do.body ] + %21 = phi i32 [ %start2, %arm_mean_f32_mve.exit ], [ %27, %do.body ] %pSrc.addr.01 = bitcast float* %pSrc.addr.0 to <4 x float>* %22 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blkCnt.0) %23 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %pSrc.addr.01, i32 4, <4 x i1> %22, <4 x float> 
zeroinitializer) @@ -87,7 +89,7 @@ declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #1 ; Function Attrs: noduplicate nounwind - declare void @llvm.set.loop.iterations.i32(i32) #3 + declare i32 @llvm.start.loop.iterations.i32(i32) #3 ; Function Attrs: noduplicate nounwind declare i32 @llvm.loop.decrement.reg.i32(i32, i32) #3 @@ -152,32 +154,22 @@ ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -8 - ; CHECK: $r3 = tMOVr $r1, 14 /* CC::al */, $noreg - ; CHECK: tCMPi8 renamable $r1, 4, 14 /* CC::al */, $noreg, implicit-def $cpsr - ; CHECK: t2IT 10, 8, implicit-def $itstate - ; CHECK: renamable $r3 = tMOVi8 $noreg, 4, 10 /* CC::ge */, killed $cpsr, implicit killed renamable $r3, implicit killed $itstate - ; CHECK: renamable $r12 = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r3, dead $cpsr = tSUBrr renamable $r1, killed renamable $r3, 14 /* CC::al */, $noreg ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 - ; CHECK: renamable $r3, dead $cpsr = tADDi8 killed renamable $r3, 3, 14 /* CC::al */, $noreg - ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r12, killed renamable $r3, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: $r3 = tMOVr $r1, 14 /* CC::al */, $noreg ; CHECK: $r12 = tMOVr $r0, 14 /* CC::al */, $noreg - ; CHECK: $lr = t2DLS killed renamable $lr + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r3 ; CHECK: $r4 = tMOVr $lr, 14 /* CC::al */, $noreg ; CHECK: bb.1.do.body.i: ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) - ; CHECK: liveins: $lr, $q0, $r0, $r1, $r2, $r3, $r4, $r12 - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg - ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg - ; CHECK: MVE_VPST 4, implicit $vpr - ; CHECK: renamable $r12, renamable $q1 = MVE_VLDRWU32_post killed 
renamable $r12, 16, 1, renamable $vpr :: (load 16 from %ir.pSrc.addr.0.i2, align 4) - ; CHECK: renamable $q0 = nnan ninf nsz arcp contract afn reassoc MVE_VADDf32 killed renamable $q0, killed renamable $q1, 1, killed renamable $vpr, killed renamable $q0 - ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1 + ; CHECK: liveins: $lr, $q0, $r0, $r1, $r2, $r4, $r12 + ; CHECK: renamable $r12, renamable $q1 = MVE_VLDRWU32_post killed renamable $r12, 16, 0, $noreg :: (load 16 from %ir.pSrc.addr.0.i2, align 4) + ; CHECK: renamable $q0 = nnan ninf nsz arcp contract afn reassoc MVE_VADDf32 killed renamable $q0, killed renamable $q1, 0, killed $noreg, killed renamable $q0 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 ; CHECK: bb.2.arm_mean_f32_mve.exit: ; CHECK: successors: %bb.3(0x80000000) ; CHECK: liveins: $q0, $r0, $r1, $r2, $r4 ; CHECK: $s4 = VMOVSR $r1, 14 /* CC::al */, $noreg + ; CHECK: dead $lr = tMOVr $r4, 14 /* CC::al */, $noreg ; CHECK: renamable $s0 = nnan ninf nsz arcp contract afn reassoc VADDS killed renamable $s3, killed renamable $s3, 14 /* CC::al */, $noreg, implicit killed $q0 ; CHECK: $lr = t2DLS killed $r4 ; CHECK: renamable $s4 = VUITOS killed renamable $s4, 14 /* CC::al */, $noreg @@ -224,7 +216,7 @@ renamable $lr = nuw nsw t2ADDrs killed renamable $r12, killed renamable $r3, 19, 14 /* CC::al */, $noreg, $noreg $r3 = tMOVr $r1, 14 /* CC::al */, $noreg $r12 = tMOVr $r0, 14 /* CC::al */, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr $r4 = tMOVr $lr, 14 /* CC::al */, $noreg bb.1.do.body.i: @@ -247,7 +239,7 @@ $s4 = VMOVSR $r1, 14 /* CC::al */, $noreg $lr = tMOVr $r4, 14 /* CC::al */, $noreg renamable $s0 = nnan ninf nsz arcp contract afn reassoc VADDS killed renamable $s3, renamable $s3, 14 /* CC::al */, $noreg, implicit $q0 - t2DoLoopStart killed $r4 + $lr = t2DoLoopStart killed $r4 renamable $s4 = VUITOS killed renamable $s4, 14 /* CC::al */, $noreg renamable $s0 = nnan ninf nsz arcp contract afn reassoc VDIVS killed 
renamable $s0, killed renamable $s4, 14 /* CC::al */, $noreg renamable $r3 = VMOVRS killed renamable $s0, 14 /* CC::al */, $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-lr-terminator.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-lr-terminator.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-lr-terminator.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-lr-terminator.mir @@ -14,13 +14,13 @@ br i1 %cmp9, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %a, %vector.ph ] %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %6 = phi i32 [ %5, %vector.ph ], [ %13, %vector.body ] + %6 = phi i32 [ %start, %vector.ph ], [ %13, %vector.body ] %7 = phi i32 [ %div, %vector.ph ], [ %9, %vector.body ] %lsr.iv1 = bitcast i32* %lsr.iv to <4 x i32>* %8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7) @@ -46,7 +46,7 @@ } declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32 immarg, <4 x i1>, <4 x i8>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) - declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.start.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <4 x i1> @llvm.arm.mve.vctp32(i32) ... 
@@ -153,7 +153,7 @@ renamable $r5 = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg renamable $r12 = t2LSRri killed renamable $r3, 1, 14, $noreg, $noreg renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg - t2DoLoopStart renamable $r5 + $lr = t2DoLoopStart renamable $r5 $lr = tMOVr killed $r5, 14, $noreg bb.2.vector.body: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll @@ -6,35 +6,31 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: mov r3, r1 +; CHECK-NEXT: mov r4, r1 ; CHECK-NEXT: cmp r1, #4 ; CHECK-NEXT: it ge -; CHECK-NEXT: movge r3, #4 -; CHECK-NEXT: mov.w r12, #1 -; CHECK-NEXT: subs r3, r1, r3 +; CHECK-NEXT: movge r4, #4 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: subs r4, r1, r4 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: adds r3, #3 -; CHECK-NEXT: add.w lr, r12, r3, lsr #2 +; CHECK-NEXT: adds r4, #3 +; CHECK-NEXT: add.w r12, r3, r4, lsr #2 ; CHECK-NEXT: mov r3, r1 -; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: mov r4, lr +; CHECK-NEXT: dlstp.32 lr, r3 +; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: .LBB0_1: @ %do.body.i ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vctp.32 r3 -; CHECK-NEXT: subs r3, #4 -; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrwt.u32 q1, [r12], #16 -; CHECK-NEXT: vaddt.f32 q0, q0, q1 -; CHECK-NEXT: le lr, .LBB0_1 +; CHECK-NEXT: vldrw.u32 q1, [r4], #16 +; CHECK-NEXT: vadd.f32 q0, q0, q1 +; CHECK-NEXT: letp lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %arm_mean_f32_mve.exit ; CHECK-NEXT: vmov s4, r1 +; CHECK-NEXT: dls lr, r12 ; CHECK-NEXT: vadd.f32 s0, s3, s3 ; CHECK-NEXT: mov r3, r1 ; CHECK-NEXT: vcvt.f32.u32 s4, s4 -; CHECK-NEXT: dls lr, r4 ; CHECK-NEXT: vdiv.f32 s0, s0, s4 -; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: vmov r4, s0 ; 
CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: .LBB0_3: @ %do.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -42,7 +38,7 @@ ; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vpsttt ; CHECK-NEXT: vldrwt.u32 q1, [r0], #16 -; CHECK-NEXT: vsubt.f32 q1, q1, r12 +; CHECK-NEXT: vsubt.f32 q1, q1, r4 ; CHECK-NEXT: vfmat.f32 q0, q1, q1 ; CHECK-NEXT: le lr, .LBB0_3 ; CHECK-NEXT: @ %bb.4: @ %do.end diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir @@ -18,13 +18,13 @@ br i1 %cmp9, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %a, %vector.ph ] %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %6 = phi i32 [ %5, %vector.ph ], [ %13, %vector.body ] + %6 = phi i32 [ %start, %vector.ph ], [ %13, %vector.body ] %7 = phi i32 [ %div, %vector.ph ], [ %9, %vector.body ] %lsr.iv1 = bitcast i32* %lsr.iv to <4 x i32>* %8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7) @@ -50,7 +50,7 @@ } declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32 immarg, <4 x i1>, <4 x i8>) #1 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2 - declare void @llvm.set.loop.iterations.i32(i32) #3 + declare i32 @llvm.start.loop.iterations.i32(i32) #3 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4 @@ -169,7 +169,7 @@ renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg $r12 = t2MOVr 
killed $r3, 14, $noreg, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg renamable $r12 = t2LSRri killed renamable $r12, 1, 14, $noreg, $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir @@ -18,13 +18,13 @@ br i1 %cmp9, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %a, %vector.ph ] %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %6 = phi i32 [ %5, %vector.ph ], [ %13, %vector.body ] + %6 = phi i32 [ %start, %vector.ph ], [ %13, %vector.body ] %7 = phi i32 [ %div, %vector.ph ], [ %9, %vector.body ] %lsr.iv1 = bitcast i32* %lsr.iv to <4 x i32>* %8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7) @@ -50,7 +50,7 @@ } declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32 immarg, <4 x i1>, <4 x i8>) #1 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2 - declare void @llvm.set.loop.iterations.i32(i32) #3 + declare i32 @llvm.start.loop.iterations.i32(i32) #3 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4 @@ -168,7 +168,7 @@ renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr $r12 = t2MOVr killed $r3, 14, $noreg, $noreg renamable 
$r3, dead $cpsr = tMOVi8 0, 14, $noreg renamable $r12 = t2LSRri killed renamable $r12, 1, 14, $noreg, $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/multi-block-cond-iter-count.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/multi-block-cond-iter-count.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/multi-block-cond-iter-count.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/multi-block-cond-iter-count.mir @@ -36,17 +36,17 @@ br i1 %26, label %49, label %31 31: ; preds = %23 - call void @llvm.set.loop.iterations.i32(i32 %30) + %start1 = call i32 @llvm.start.loop.iterations.i32(i32 %30) br label %65 32: ; preds = %11 - call void @llvm.set.loop.iterations.i32(i32 %22) + %start2 = call i32 @llvm.start.loop.iterations.i32(i32 %22) br label %33 33: ; preds = %33, %32 %34 = phi i32* [ %46, %33 ], [ %0, %32 ] %35 = phi i32* [ %45, %33 ], [ %1, %32 ] - %36 = phi i32 [ %22, %32 ], [ %47, %33 ] + %36 = phi i32 [ %start2, %32 ], [ %47, %33 ] %37 = phi i32 [ %9, %32 ], [ %41, %33 ] %38 = bitcast i32* %34 to <4 x i32>* %39 = bitcast i32* %35 to <4 x i32>* @@ -89,7 +89,7 @@ 65: ; preds = %65, %31 %66 = phi i32 [ %108, %65 ], [ 0, %31 ] %67 = phi i32 [ 0, %31 ], [ %107, %65 ] - %68 = phi i32 [ %30, %31 ], [ %109, %65 ] + %68 = phi i32 [ %start1, %31 ], [ %109, %65 ] %69 = bitcast i32* %0 to i8* %70 = bitcast i32* %1 to i8* %71 = getelementptr i8, i8* %70, i32 %66 @@ -141,7 +141,7 @@ declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #1 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2 - declare void @llvm.set.loop.iterations.i32(i32) #3 + declare i32 @llvm.start.loop.iterations.i32(i32) #3 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4 @@ -353,7 +353,7 @@ renamable $r2, dead $cpsr = tMOVi8 1, 14, $noreg renamable $lr = nuw nsw t2ADDrs killed renamable $r2, killed renamable $r12, 19, 14, $noreg, $noreg $r2 = 
tMOVr $r0, 14, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.3 (%ir-block.33): successors: %bb.3(0x7c000000), %bb.4(0x04000000) @@ -402,7 +402,7 @@ renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r2, 19, 14, $noreg, $noreg renamable $r2, dead $cpsr = tMOVi8 0, 14, $noreg renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.8 (%ir-block.65): successors: %bb.8(0x7c000000), %bb.9(0x04000000) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/multi-cond-iter-count.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/multi-cond-iter-count.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/multi-cond-iter-count.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/multi-cond-iter-count.mir @@ -18,13 +18,13 @@ br i1 %10, label %34, label %17 17: ; preds = %4 - call void @llvm.set.loop.iterations.i32(i32 %16) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %16) br label %18 18: ; preds = %18, %17 %19 = phi i32* [ %31, %18 ], [ %0, %17 ] %20 = phi i32* [ %30, %18 ], [ %1, %17 ] - %21 = phi i32 [ %16, %17 ], [ %32, %18 ] + %21 = phi i32 [ %start, %17 ], [ %32, %18 ] %22 = phi i32 [ %9, %17 ], [ %26, %18 ] %23 = bitcast i32* %19 to <4 x i32>* %24 = bitcast i32* %20 to <4 x i32>* @@ -45,7 +45,7 @@ } declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) - declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.start.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <4 x i1> @llvm.arm.mve.vctp32(i32) @@ -143,7 +143,7 @@ renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14, $noreg, $noreg $r3 = tMOVr $r0, 14, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.2 
(%ir-block.18): successors: %bb.2(0x7c000000), %bb.3(0x04000000) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/multiblock-massive.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/multiblock-massive.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/multiblock-massive.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/multiblock-massive.mir @@ -8,7 +8,7 @@ br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader for.body.preheader: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %N) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %N) br label %for.body for.cond.cleanup: ; preds = %for.end, %entry @@ -18,7 +18,7 @@ %lsr.iv4 = phi i32* [ %b, %for.body.preheader ], [ %scevgep5, %for.end ] %lsr.iv2 = phi i32* [ %c, %for.body.preheader ], [ %scevgep3, %for.end ] %lsr.iv1 = phi i32* [ %a, %for.body.preheader ], [ %scevgep, %for.end ] - %lsr.iv = phi i32 [ %N, %for.body.preheader ], [ %lsr.iv.next, %for.end ] + %lsr.iv = phi i32 [ %start, %for.body.preheader ], [ %lsr.iv.next, %for.end ] %size = call i32 @llvm.arm.space(i32 3072, i32 undef) %0 = load i32, i32* %lsr.iv4, align 4 %1 = load i32, i32* %lsr.iv2, align 4 @@ -46,7 +46,7 @@ declare i32 @llvm.arm.space(i32 immarg, i32) #0 ; Function Attrs: noduplicate nounwind - declare void @llvm.set.loop.iterations.i32(i32) #1 + declare i32 @llvm.start.loop.iterations.i32(i32) #1 ; Function Attrs: noduplicate nounwind declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1 @@ -166,7 +166,7 @@ liveins: $r0, $r1, $r2, $r3, $r4, $lr $lr = tMOVr $r3, 14, $noreg - t2DoLoopStart killed $r3 + $lr = t2DoLoopStart killed $r3 tB %bb.2, 14, $noreg bb.2.for.end: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/multiple-do-loops.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/multiple-do-loops.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/multiple-do-loops.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/multiple-do-loops.mir @@ -14,14 +14,14 @@ br i1 %cmp30, label 
%for.cond.cleanup6, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %5) + %start1 = call i32 @llvm.start.loop.iterations.i32(i32 %5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %lsr.iv68 = phi i32* [ %scevgep69, %vector.body ], [ %a, %vector.ph ] %lsr.iv65 = phi i32* [ %scevgep66, %vector.body ], [ %c, %vector.ph ] %lsr.iv62 = phi i32* [ %scevgep63, %vector.body ], [ %b, %vector.ph ] - %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ] + %6 = phi i32 [ %start1, %vector.ph ], [ %11, %vector.body ] %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ] %lsr.iv6870 = bitcast i32* %lsr.iv68 to <4 x i32>* %lsr.iv6567 = bitcast i32* %lsr.iv65 to <4 x i32>* @@ -50,14 +50,14 @@ br i1 %13, label %for.cond.cleanup6, label %vector.ph39 vector.ph39: ; preds = %for.cond4.preheader - call void @llvm.set.loop.iterations.i32(i32 %19) + %start2 = call i32 @llvm.start.loop.iterations.i32(i32 %19) br label %vector.body38 vector.body38: ; preds = %vector.body38, %vector.ph39 %lsr.iv59 = phi i32* [ %scevgep60, %vector.body38 ], [ %a, %vector.ph39 ] %lsr.iv56 = phi i32* [ %scevgep57, %vector.body38 ], [ %c, %vector.ph39 ] %lsr.iv = phi i32* [ %scevgep, %vector.body38 ], [ %b, %vector.ph39 ] - %20 = phi i32 [ %19, %vector.ph39 ], [ %26, %vector.body38 ] + %20 = phi i32 [ %start2, %vector.ph39 ], [ %26, %vector.body38 ] %21 = phi i32 [ %N, %vector.ph39 ], [ %23, %vector.body38 ] %lsr.iv5961 = bitcast i32* %lsr.iv59 to <4 x i32>* %lsr.iv5658 = bitcast i32* %lsr.iv56 to <4 x i32>* @@ -94,14 +94,14 @@ br i1 %cmp30, label %for.cond4.preheader, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %5) + %start1 = call i32 @llvm.start.loop.iterations.i32(i32 %5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %lsr.iv68 = phi i32* [ %scevgep69, %vector.body ], [ %a, %vector.ph ] %lsr.iv65 = phi i32* [ %scevgep66, %vector.body ], [ %c, %vector.ph ] %lsr.iv62 = phi 
i32* [ %scevgep63, %vector.body ], [ %b, %vector.ph ] - %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ] + %6 = phi i32 [ %start1, %vector.ph ], [ %11, %vector.body ] %7 = phi i32 [ %div, %vector.ph ], [ %9, %vector.body ] %lsr.iv6870 = bitcast i32* %lsr.iv68 to <4 x i32>* %lsr.iv6567 = bitcast i32* %lsr.iv65 to <4 x i32>* @@ -130,14 +130,14 @@ br i1 %cmp528, label %for.cond.cleanup6, label %vector.ph39 vector.ph39: ; preds = %for.cond4.preheader - call void @llvm.set.loop.iterations.i32(i32 %18) + %start2 = call i32 @llvm.start.loop.iterations.i32(i32 %18) br label %vector.body38 vector.body38: ; preds = %vector.body38, %vector.ph39 %lsr.iv59 = phi i32* [ %scevgep60, %vector.body38 ], [ %a, %vector.ph39 ] %lsr.iv56 = phi i32* [ %scevgep57, %vector.body38 ], [ %c, %vector.ph39 ] %lsr.iv = phi i32* [ %scevgep, %vector.body38 ], [ %b, %vector.ph39 ] - %19 = phi i32 [ %18, %vector.ph39 ], [ %25, %vector.body38 ] + %19 = phi i32 [ %start2, %vector.ph39 ], [ %25, %vector.body38 ] %20 = phi i32 [ %N, %vector.ph39 ], [ %22, %vector.body38 ] %lsr.iv5961 = bitcast i32* %lsr.iv59 to <4 x i32>* %lsr.iv5658 = bitcast i32* %lsr.iv56 to <4 x i32>* @@ -173,14 +173,14 @@ br i1 %cmp54, label %for.cond.cleanup17, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %5) + %start1 = call i32 @llvm.start.loop.iterations.i32(i32 %5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %lsr.iv123 = phi i32* [ %scevgep124, %vector.body ], [ %a, %vector.ph ] %lsr.iv120 = phi i32* [ %scevgep121, %vector.body ], [ %c, %vector.ph ] %lsr.iv117 = phi i32* [ %scevgep118, %vector.body ], [ %b, %vector.ph ] - %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ] + %6 = phi i32 [ %start1, %vector.ph ], [ %11, %vector.body ] %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ] %lsr.iv123125 = bitcast i32* %lsr.iv123 to <4 x i32>* %lsr.iv120122 = bitcast i32* %lsr.iv120 to <4 x i32>* @@ -210,14 +210,14 @@ br i1 %cmp552, label 
%for.cond15.preheader, label %vector.ph66 vector.ph66: ; preds = %for.cond4.preheader - call void @llvm.set.loop.iterations.i32(i32 %18) + %start2 = call i32 @llvm.start.loop.iterations.i32(i32 %18) br label %vector.body65 vector.body65: ; preds = %vector.body65, %vector.ph66 %lsr.iv114 = phi i32* [ %scevgep115, %vector.body65 ], [ %a, %vector.ph66 ] %lsr.iv111 = phi i32* [ %scevgep112, %vector.body65 ], [ %c, %vector.ph66 ] %lsr.iv108 = phi i32* [ %scevgep109, %vector.body65 ], [ %b, %vector.ph66 ] - %19 = phi i32 [ %18, %vector.ph66 ], [ %25, %vector.body65 ] + %19 = phi i32 [ %start2, %vector.ph66 ], [ %25, %vector.body65 ] %20 = phi i32 [ %div, %vector.ph66 ], [ %22, %vector.body65 ] %lsr.iv114116 = bitcast i32* %lsr.iv114 to <4 x i32>* %lsr.iv111113 = bitcast i32* %lsr.iv111 to <4 x i32>* @@ -248,14 +248,14 @@ br i1 %27, label %for.cond.cleanup17, label %vector.ph85 vector.ph85: ; preds = %for.cond15.preheader - call void @llvm.set.loop.iterations.i32(i32 %33) + %start3 = call i32 @llvm.start.loop.iterations.i32(i32 %33) br label %vector.body84 vector.body84: ; preds = %vector.body84, %vector.ph85 %lsr.iv105 = phi i32* [ %scevgep106, %vector.body84 ], [ %a, %vector.ph85 ] %lsr.iv102 = phi i32* [ %scevgep103, %vector.body84 ], [ %c, %vector.ph85 ] %lsr.iv = phi i32* [ %scevgep, %vector.body84 ], [ %b, %vector.ph85 ] - %34 = phi i32 [ %33, %vector.ph85 ], [ %40, %vector.body84 ] + %34 = phi i32 [ %start3, %vector.ph85 ], [ %40, %vector.body84 ] %35 = phi i32 [ %N, %vector.ph85 ], [ %37, %vector.body84 ] %lsr.iv105107 = bitcast i32* %lsr.iv105 to <4 x i32>* %lsr.iv102104 = bitcast i32* %lsr.iv102 to <4 x i32>* @@ -280,7 +280,7 @@ } declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) - declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.start.loop.iterations.i32(i32) declare i32 
@llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <4 x i1> @llvm.arm.mve.vctp32(i32) @@ -431,7 +431,7 @@ $r4 = tMOVr $r3, 14, $noreg renamable $lr = nuw nsw t2ADDrs killed renamable $r6, renamable $r12, 19, 14, $noreg, $noreg $r6 = tMOVr $r1, 14, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.2.vector.body: successors: %bb.2(0x7c000000), %bb.3(0x04000000) @@ -462,7 +462,7 @@ renamable $r6, dead $cpsr = tMOVi8 1, 14, $noreg renamable $lr = nuw nsw t2ADDrs killed renamable $r6, killed renamable $r12, 19, 14, $noreg, $noreg $r12 = tMOVr $r0, 14, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.5.vector.body38: successors: %bb.5(0x7c000000), %bb.6(0x04000000) @@ -637,7 +637,7 @@ renamable $r6, dead $cpsr = tSUBi8 killed renamable $r6, 4, 14, $noreg renamable $lr = nuw nsw t2ADDrs renamable $r12, killed renamable $r6, 19, 14, $noreg, $noreg $r6 = tMOVr $r2, 14, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.2.vector.body: successors: %bb.2(0x7c000000), %bb.3(0x04000000) @@ -670,7 +670,7 @@ renamable $r6 = t2BICri killed renamable $r6, 3, 14, $noreg, $noreg renamable $r6, dead $cpsr = tSUBi8 killed renamable $r6, 4, 14, $noreg renamable $lr = nuw nsw t2ADDrs killed renamable $r12, killed renamable $r6, 19, 14, $noreg, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.5.vector.body38: successors: %bb.5(0x7c000000), %bb.6(0x04000000) @@ -878,7 +878,7 @@ $r4 = tMOVr $r3, 14, $noreg renamable $lr = nuw nsw t2ADDrs killed renamable $r6, renamable $r12, 19, 14, $noreg, $noreg $r6 = tMOVr $r1, 14, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.2.vector.body: successors: %bb.2(0x7c000000), %bb.3(0x04000000) @@ -919,7 +919,7 @@ $r4 = tMOVr $r1, 14, $noreg renamable $lr = nuw nsw t2ADDrs renamable $r8, killed renamable $r6, 19, 14, $noreg, $noreg $r6 = tMOVr $r0, 14, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr 
bb.5.vector.body65: successors: %bb.5(0x7c000000), %bb.6(0x04000000) @@ -952,7 +952,7 @@ renamable $lr = nuw nsw t2ADDrs killed renamable $r8, killed renamable $r12, 19, 14, $noreg, $noreg $r5 = tMOVr $r0, 14, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.8.vector.body84: successors: %bb.8(0x7c000000), %bb.9(0x04000000) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll @@ -92,9 +92,9 @@ ; CHECK-NEXT: sub.w r7, r12, #4 ; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: mov r5, r1 -; CHECK-NEXT: add.w lr, r6, r7, lsr #2 +; CHECK-NEXT: add.w r7, r6, r7, lsr #2 ; CHECK-NEXT: mov r6, r2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dls lr, r7 ; CHECK-NEXT: .LBB0_12: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r5], #16 @@ -311,9 +311,9 @@ ; CHECK-NEXT: sub.w r7, r12, #4 ; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: mov r5, r1 -; CHECK-NEXT: add.w lr, r6, r7, lsr #2 +; CHECK-NEXT: add.w r7, r6, r7, lsr #2 ; CHECK-NEXT: mov r6, r2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dls lr, r7 ; CHECK-NEXT: .LBB1_12: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r5], #16 @@ -530,9 +530,9 @@ ; CHECK-NEXT: sub.w r7, r12, #4 ; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: mov r5, r1 -; CHECK-NEXT: add.w lr, r6, r7, lsr #2 +; CHECK-NEXT: add.w r7, r6, r7, lsr #2 ; CHECK-NEXT: mov r6, r2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dls lr, r7 ; CHECK-NEXT: .LBB2_12: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r5], #16 @@ -680,9 +680,9 @@ ; CHECK-NEXT: sub.w r7, r12, #4 ; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: mov r5, r1 -; CHECK-NEXT: add.w lr, r6, r7, lsr #2 +; CHECK-NEXT: add.w r7, r6, r7, lsr #2 ; CHECK-NEXT: mov r6, r2 -; 
CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dls lr, r7 ; CHECK-NEXT: .LBB3_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r5], #16 @@ -889,10 +889,10 @@ ; CHECK-NEXT: movs r5, #1 ; CHECK-NEXT: sub.w r6, r12, #4 ; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: add.w lr, r5, r6, lsr #2 +; CHECK-NEXT: add.w r6, r5, r6, lsr #2 ; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: dls lr, r6 ; CHECK-NEXT: mov r6, r2 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB4_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r4], #16 @@ -906,11 +906,11 @@ ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, r5, r6, pc} ; CHECK-NEXT: .LBB4_6: @ %for.body.preheader11 -; CHECK-NEXT: sub.w lr, r3, r12 +; CHECK-NEXT: sub.w r3, r3, r12 ; CHECK-NEXT: add.w r0, r0, r12, lsl #2 ; CHECK-NEXT: add.w r1, r1, r12, lsl #2 ; CHECK-NEXT: add.w r2, r2, r12, lsl #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: .LBB4_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r3, [r0], #4 @@ -994,10 +994,10 @@ ; CHECK-NEXT: movs r5, #1 ; CHECK-NEXT: sub.w r6, r12, #4 ; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: add.w lr, r5, r6, lsr #2 +; CHECK-NEXT: add.w r6, r5, r6, lsr #2 ; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: dls lr, r6 ; CHECK-NEXT: mov r6, r2 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB5_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr.w r9, [r4] @@ -1021,11 +1021,11 @@ ; CHECK-NEXT: cmp r12, r3 ; CHECK-NEXT: beq .LBB5_8 ; CHECK-NEXT: .LBB5_6: @ %for.body.preheader11 -; CHECK-NEXT: sub.w lr, r3, r12 +; CHECK-NEXT: sub.w r3, r3, r12 ; CHECK-NEXT: add.w r0, r0, r12, lsl #1 ; CHECK-NEXT: add.w r1, r1, r12, lsl #1 ; CHECK-NEXT: add.w r2, r2, r12, lsl #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: .LBB5_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldr.16 s0, [r1] @@ -1111,10 +1111,10 @@ ; CHECK-NEXT: movs r5, #1 
; CHECK-NEXT: sub.w r6, r12, #4 ; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: add.w lr, r5, r6, lsr #2 +; CHECK-NEXT: add.w r6, r5, r6, lsr #2 ; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: dls lr, r6 ; CHECK-NEXT: mov r6, r2 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB6_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr.w r9, [r4] @@ -1138,11 +1138,11 @@ ; CHECK-NEXT: cmp r12, r3 ; CHECK-NEXT: beq .LBB6_8 ; CHECK-NEXT: .LBB6_6: @ %for.body.preheader11 -; CHECK-NEXT: sub.w lr, r3, r12 +; CHECK-NEXT: sub.w r3, r3, r12 ; CHECK-NEXT: add.w r0, r0, r12, lsl #1 ; CHECK-NEXT: add.w r1, r1, r12, lsl #1 ; CHECK-NEXT: add.w r2, r2, r12, lsl #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: .LBB6_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldr.16 s0, [r1] @@ -1228,10 +1228,10 @@ ; CHECK-NEXT: movs r5, #1 ; CHECK-NEXT: sub.w r6, r12, #4 ; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: add.w lr, r5, r6, lsr #2 +; CHECK-NEXT: add.w r6, r5, r6, lsr #2 ; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: dls lr, r6 ; CHECK-NEXT: mov r6, r2 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB7_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr.w r9, [r4] @@ -1255,11 +1255,11 @@ ; CHECK-NEXT: cmp r12, r3 ; CHECK-NEXT: beq .LBB7_8 ; CHECK-NEXT: .LBB7_6: @ %for.body.preheader11 -; CHECK-NEXT: sub.w lr, r3, r12 +; CHECK-NEXT: sub.w r3, r3, r12 ; CHECK-NEXT: add.w r0, r0, r12, lsl #1 ; CHECK-NEXT: add.w r1, r1, r12, lsl #1 ; CHECK-NEXT: add.w r2, r2, r12, lsl #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: .LBB7_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldr.16 s0, [r1] @@ -1345,10 +1345,10 @@ ; CHECK-NEXT: movs r5, #1 ; CHECK-NEXT: sub.w r6, r12, #4 ; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: add.w lr, r5, r6, lsr #2 +; CHECK-NEXT: add.w r6, r5, r6, lsr #2 ; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: dls lr, r6 ; CHECK-NEXT: mov r6, r2 -; CHECK-NEXT: dls 
lr, lr ; CHECK-NEXT: .LBB8_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u32 q0, [r5], #8 @@ -1377,11 +1377,11 @@ ; CHECK-NEXT: cmp r12, r3 ; CHECK-NEXT: beq .LBB8_8 ; CHECK-NEXT: .LBB8_6: @ %for.body.preheader13 -; CHECK-NEXT: sub.w lr, r3, r12 +; CHECK-NEXT: sub.w r3, r3, r12 ; CHECK-NEXT: add.w r0, r0, r12, lsl #1 ; CHECK-NEXT: add.w r1, r1, r12, lsl #1 ; CHECK-NEXT: add.w r2, r2, r12, lsl #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: .LBB8_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrsh r3, [r1], #2 @@ -1476,9 +1476,9 @@ ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vldr s0, .LCPI9_0 ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: add.w lr, r3, r2, lsr #2 +; CHECK-NEXT: add.w r2, r3, r2, lsr #2 ; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dls lr, r2 ; CHECK-NEXT: .LBB9_5: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r4, r0, r3 @@ -1633,9 +1633,9 @@ ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vldr s0, .LCPI10_0 ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: add.w lr, r3, r2, lsr #2 +; CHECK-NEXT: add.w r2, r3, r2, lsr #2 ; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dls lr, r2 ; CHECK-NEXT: .LBB10_5: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r4, r0, r3 @@ -1790,10 +1790,10 @@ ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vldr s0, .LCPI11_0 ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: add.w lr, r3, r2, lsr #2 +; CHECK-NEXT: add.w r2, r3, r2, lsr #2 ; CHECK-NEXT: adds r3, r1, #4 +; CHECK-NEXT: dls lr, r2 ; CHECK-NEXT: adds r2, r0, #4 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB11_5: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrsh.w r4, [r3, #2] diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll --- 
a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll @@ -15,9 +15,9 @@ ; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 +; CHECK-NEXT: add.w r3, r3, r12, lsr #2 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 @@ -91,9 +91,9 @@ ; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 +; CHECK-NEXT: add.w r3, r3, r12, lsr #2 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 @@ -167,9 +167,9 @@ ; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 +; CHECK-NEXT: add.w r3, r3, r12, lsr #2 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB2_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 @@ -243,9 +243,9 @@ ; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 +; CHECK-NEXT: add.w r3, r3, r12, lsr #2 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 @@ -319,9 +319,9 @@ ; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 +; CHECK-NEXT: add.w r3, r3, r12, lsr #2 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB4_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop 
Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 @@ -430,10 +430,10 @@ ; CHECK-NEXT: add.w r4, r3, #8 ; CHECK-NEXT: subs r5, #4 ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: add.w lr, r6, r5, lsr #2 +; CHECK-NEXT: add.w r6, r6, r5, lsr #2 ; CHECK-NEXT: adds r5, r0, #3 +; CHECK-NEXT: dls lr, r6 ; CHECK-NEXT: adds r6, r1, #1 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB5_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrb r8, [r5, #-3] @@ -624,8 +624,8 @@ ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, pc} ; CHECK-NEXT: .LBB6_1: @ %vector.ph -; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r12 +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: .LBB6_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r4, #4 @@ -732,10 +732,10 @@ ; CHECK-NEXT: add.w r4, r3, #8 ; CHECK-NEXT: subs r5, #4 ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: add.w lr, r6, r5, lsr #2 +; CHECK-NEXT: add.w r6, r6, r5, lsr #2 ; CHECK-NEXT: adds r5, r0, #3 +; CHECK-NEXT: dls lr, r6 ; CHECK-NEXT: adds r6, r1, #1 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB7_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrb r8, [r5, #-3] @@ -926,8 +926,8 @@ ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, pc} ; CHECK-NEXT: .LBB8_1: @ %vector.ph -; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r12 +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: .LBB8_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r4, #4 @@ -1034,10 +1034,10 @@ ; CHECK-NEXT: add.w r4, r3, #8 ; CHECK-NEXT: subs r5, #4 ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: add.w lr, r6, r5, lsr #2 +; CHECK-NEXT: add.w r6, r6, r5, lsr #2 ; CHECK-NEXT: add.w r5, r0, #8 +; CHECK-NEXT: dls lr, r6 ; CHECK-NEXT: add.w r6, r1, #8 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB9_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r8, [r5, #-8] @@ -1214,8 +1214,8 @@ ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} ; CHECK-NEXT: .LBB10_1: @ 
%vector.ph -; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.16 lr, r3 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: .LBB10_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: add.w r12, r12, #8 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll @@ -12,47 +12,47 @@ ; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT28:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT29:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT28]], <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP:%.*]] = add i32 [[N_VEC]], -4 -; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP]], 2 -; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1 +; CHECK-NEXT: [[TT:%.*]] = add i32 [[N_VEC]], -4 +; CHECK-NEXT: [[TT1:%.*]] = lshr i32 [[TT]], 2 +; CHECK-NEXT: [[TT2:%.*]] = add nuw nsw i32 [[TT1]], 1 ; CHECK-NEXT: br label [[FOR_COND1_PREHEADER_US:%.*]] ; CHECK: for.cond1.preheader.us: ; CHECK-NEXT: [[I_025_US:%.*]] = phi i32 [ [[INC10_US:%.*]], [[MIDDLE_BLOCK:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ] ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i16*, i16** [[A:%.*]], i32 [[I_025_US]] -; CHECK-NEXT: [[TMP3:%.*]] = load i16*, i16** [[ARRAYIDX_US]], align 4 +; CHECK-NEXT: [[TT3:%.*]] = load i16*, i16** [[ARRAYIDX_US]], align 4 ; CHECK-NEXT: [[ARRAYIDX8_US:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i32 [[I_025_US]] ; CHECK-NEXT: [[ARRAYIDX8_PROMOTED_US:%.*]] = load i32, i32* [[ARRAYIDX8_US]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> , i32 [[ARRAYIDX8_PROMOTED_US]], i32 0 -; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP2]]) +; CHECK-NEXT: [[TT4:%.*]] = insertelement <4 x i32> , i32 [[ARRAYIDX8_PROMOTED_US]], i32 0 
+; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TT2]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP4]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ [[TMP2]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TT4]], [[FOR_COND1_PREHEADER_US]] ], [ [[TT14:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TT5:%.*]] = phi i32 [ [[START]], [[FOR_COND1_PREHEADER_US]] ], [ [[TT15:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP3]], i32 [[INDEX]] +; CHECK-NEXT: [[TT6:%.*]] = getelementptr inbounds i16, i16* [[TT3]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]]) ; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16* [[TMP6]] to <4 x i16>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP8]], i32 2, <4 x i1> [[TMP1]], <4 x i16> undef) -; CHECK-NEXT: [[TMP9:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD]] to <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, i16* [[B:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to <4 x i16>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD30:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x 
i16>* [[TMP11]], i32 2, <4 x i1> [[TMP1]], <4 x i16> undef) -; CHECK-NEXT: [[TMP12:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD30]] to <4 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = mul nsw <4 x i32> [[TMP12]], [[TMP9]] -; CHECK-NEXT: [[TMP14]] = add nsw <4 x i32> [[TMP13]], [[VEC_PHI]] +; CHECK-NEXT: [[TT8:%.*]] = bitcast i16* [[TT6]] to <4 x i16>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TT8]], i32 2, <4 x i1> [[TMP1]], <4 x i16> undef) +; CHECK-NEXT: [[TT9:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD]] to <4 x i32> +; CHECK-NEXT: [[TT10:%.*]] = getelementptr inbounds i16, i16* [[B:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[TT11:%.*]] = bitcast i16* [[TT10]] to <4 x i16>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD30:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TT11]], i32 2, <4 x i1> [[TMP1]], <4 x i16> undef) +; CHECK-NEXT: [[TT12:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD30]] to <4 x i32> +; CHECK-NEXT: [[TT13:%.*]] = mul nsw <4 x i32> [[TT12]], [[TT9]] +; CHECK-NEXT: [[TT14]] = add nsw <4 x i32> [[TT13]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP5]], i32 1) -; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 -; CHECK-NEXT: br i1 [[TMP16]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK]] +; CHECK-NEXT: [[TT15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TT5]], i32 1) +; CHECK-NEXT: [[TT16:%.*]] = icmp ne i32 [[TT15]], 0 +; CHECK-NEXT: br i1 [[TT16]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP17:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP14]], <4 x i32> [[VEC_PHI]] -; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP17]]) -; CHECK-NEXT: store i32 [[TMP18]], i32* [[ARRAYIDX8_US]], align 4 +; CHECK-NEXT: [[TT17:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TT14]], <4 x i32> [[VEC_PHI]] +; CHECK-NEXT: [[TT18:%.*]] = call i32 
@llvm.vector.reduce.add.v4i32(<4 x i32> [[TT17]]) +; CHECK-NEXT: store i32 [[TT18]], i32* [[ARRAYIDX8_US]], align 4 ; CHECK-NEXT: [[INC10_US]] = add nuw i32 [[I_025_US]], 1 ; CHECK-NEXT: [[EXITCOND27:%.*]] = icmp eq i32 [[INC10_US]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND27]], label [[FOR_COND_CLEANUP]], label [[FOR_COND1_PREHEADER_US]] @@ -69,51 +69,51 @@ %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert28 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat29 = shufflevector <4 x i32> %broadcast.splatinsert28, <4 x i32> undef, <4 x i32> zeroinitializer - %tmp = add i32 %n.vec, -4 - %tmp1 = lshr i32 %tmp, 2 - %tmp2 = add nuw nsw i32 %tmp1, 1 + %tt = add i32 %n.vec, -4 + %tt1 = lshr i32 %tt, 2 + %tt2 = add nuw nsw i32 %tt1, 1 br label %for.cond1.preheader.us for.cond1.preheader.us: ; preds = %middle.block, %for.cond1.preheader.us.preheader %i.025.us = phi i32 [ %inc10.us, %middle.block ], [ 0, %for.cond1.preheader.us.preheader ] %arrayidx.us = getelementptr inbounds i16*, i16** %A, i32 %i.025.us - %tmp3 = load i16*, i16** %arrayidx.us, align 4 + %tt3 = load i16*, i16** %arrayidx.us, align 4 %arrayidx8.us = getelementptr inbounds i32, i32* %C, i32 %i.025.us %arrayidx8.promoted.us = load i32, i32* %arrayidx8.us, align 4 - %tmp4 = insertelement <4 x i32> , i32 %arrayidx8.promoted.us, i32 0 - call void @llvm.set.loop.iterations.i32(i32 %tmp2) + %tt4 = insertelement <4 x i32> , i32 %arrayidx8.promoted.us, i32 0 + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tt2) br label %vector.body vector.body: ; preds = %vector.body, %for.cond1.preheader.us %index = phi i32 [ 0, %for.cond1.preheader.us ], [ %index.next, %vector.body ] - %vec.phi = phi <4 x i32> [ %tmp4, %for.cond1.preheader.us ], [ %tmp14, %vector.body ] - %tmp5 = phi i32 [ %tmp2, %for.cond1.preheader.us ], [ %tmp15, %vector.body ] + %vec.phi = phi <4 x i32> [ %tt4, %for.cond1.preheader.us ], [ %tt14, %vector.body ] + %tt5 = phi i32 [ %start, %for.cond1.preheader.us ], [ 
%tt15, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, - %tmp6 = getelementptr inbounds i16, i16* %tmp3, i32 %index + %tt6 = getelementptr inbounds i16, i16* %tt3, i32 %index - ; %tmp7 = icmp ule <4 x i32> %induction, %broadcast.splat29 - %tmp7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) + ; %tt7 = icmp ule <4 x i32> %induction, %broadcast.splat29 + %tt7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) - %tmp8 = bitcast i16* %tmp6 to <4 x i16>* - %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %tmp8, i32 2, <4 x i1> %tmp7, <4 x i16> undef) - %tmp9 = sext <4 x i16> %wide.masked.load to <4 x i32> - %tmp10 = getelementptr inbounds i16, i16* %B, i32 %index - %tmp11 = bitcast i16* %tmp10 to <4 x i16>* - %wide.masked.load30 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %tmp11, i32 2, <4 x i1> %tmp7, <4 x i16> undef) - %tmp12 = sext <4 x i16> %wide.masked.load30 to <4 x i32> - %tmp13 = mul nsw <4 x i32> %tmp12, %tmp9 - %tmp14 = add nsw <4 x i32> %tmp13, %vec.phi + %tt8 = bitcast i16* %tt6 to <4 x i16>* + %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %tt8, i32 2, <4 x i1> %tt7, <4 x i16> undef) + %tt9 = sext <4 x i16> %wide.masked.load to <4 x i32> + %tt10 = getelementptr inbounds i16, i16* %B, i32 %index + %tt11 = bitcast i16* %tt10 to <4 x i16>* + %wide.masked.load30 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %tt11, i32 2, <4 x i1> %tt7, <4 x i16> undef) + %tt12 = sext <4 x i16> %wide.masked.load30 to <4 x i32> + %tt13 = mul nsw <4 x i32> %tt12, %tt9 + %tt14 = add nsw <4 x i32> %tt13, %vec.phi %index.next = add i32 %index, 4 - %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp5, i32 1) - %tmp16 = icmp ne i32 %tmp15, 0 - br i1 
%tmp16, label %vector.body, label %middle.block + %tt15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tt5, i32 1) + %tt16 = icmp ne i32 %tt15, 0 + br i1 %tt16, label %vector.body, label %middle.block middle.block: ; preds = %vector.body - %tmp17 = select <4 x i1> %tmp7, <4 x i32> %tmp14, <4 x i32> %vec.phi - %tmp18 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp17) - store i32 %tmp18, i32* %arrayidx8.us, align 4 + %tt17 = select <4 x i1> %tt7, <4 x i32> %tt14, <4 x i32> %vec.phi + %tt18 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tt17) + store i32 %tt18, i32* %arrayidx8.us, align 4 %inc10.us = add nuw i32 %i.025.us, 1 %exitcond27 = icmp eq i32 %inc10.us, %N br i1 %exitcond27, label %for.cond.cleanup, label %for.cond1.preheader.us @@ -133,45 +133,45 @@ ; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT27:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT28:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT27]], <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP:%.*]] = add i32 [[N_VEC]], -4 -; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP]], 2 -; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1 +; CHECK-NEXT: [[TT:%.*]] = add i32 [[N_VEC]], -4 +; CHECK-NEXT: [[TT1:%.*]] = lshr i32 [[TT]], 2 +; CHECK-NEXT: [[TT2:%.*]] = add nuw nsw i32 [[TT1]], 1 ; CHECK-NEXT: br label [[FOR_COND1_PREHEADER_US:%.*]] ; CHECK: for.cond1.preheader.us: ; CHECK-NEXT: [[I_024_US:%.*]] = phi i32 [ [[INC9_US:%.*]], [[MIDDLE_BLOCK:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ] ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32*, i32** [[A:%.*]], i32 [[I_024_US]] -; CHECK-NEXT: [[TMP3:%.*]] = load i32*, i32** [[ARRAYIDX_US]], align 4 +; CHECK-NEXT: [[TT3:%.*]] = load i32*, i32** [[ARRAYIDX_US]], align 4 ; CHECK-NEXT: [[ARRAYIDX7_US:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i32 [[I_024_US]] ; CHECK-NEXT: 
[[ARRAYIDX7_PROMOTED_US:%.*]] = load i32, i32* [[ARRAYIDX7_US]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> , i32 [[ARRAYIDX7_PROMOTED_US]], i32 0 -; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP2]]) +; CHECK-NEXT: [[TT4:%.*]] = insertelement <4 x i32> , i32 [[ARRAYIDX7_PROMOTED_US]], i32 0 +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TT2]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP4]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ [[TMP2]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TT4]], [[FOR_COND1_PREHEADER_US]] ], [ [[TT12:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TT5:%.*]] = phi i32 [ [[START]], [[FOR_COND1_PREHEADER_US]] ], [ [[TT13:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 [[INDEX]] +; CHECK-NEXT: [[TT6:%.*]] = getelementptr inbounds i32, i32* [[TT3]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]]) ; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP8]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) -; 
CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP10]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) -; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD29]], [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: [[TMP12]] = add nsw <4 x i32> [[VEC_PHI]], [[TMP11]] +; CHECK-NEXT: [[TT8:%.*]] = bitcast i32* [[TT6]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TT8]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[TT9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[TT10:%.*]] = bitcast i32* [[TT9]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TT10]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[TT11:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD29]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TT12]] = add nsw <4 x i32> [[VEC_PHI]], [[TT11]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP13]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP5]], i32 1) -; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[TMP14]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK]] +; CHECK-NEXT: [[TT13]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TT5]], i32 1) +; CHECK-NEXT: [[TT14:%.*]] = icmp ne i32 [[TT13]], 0 +; CHECK-NEXT: br i1 [[TT14]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP15:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP12]], <4 x i32> [[VEC_PHI]] -; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP15]]) -; CHECK-NEXT: store i32 [[TMP16]], i32* [[ARRAYIDX7_US]], align 4 +; CHECK-NEXT: [[TT15:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TT12]], <4 
x i32> [[VEC_PHI]] +; CHECK-NEXT: [[TT16:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TT15]]) +; CHECK-NEXT: store i32 [[TT16]], i32* [[ARRAYIDX7_US]], align 4 ; CHECK-NEXT: [[INC9_US]] = add nuw i32 [[I_024_US]], 1 ; CHECK-NEXT: [[EXITCOND26:%.*]] = icmp eq i32 [[INC9_US]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND26]], label [[FOR_COND_CLEANUP]], label [[FOR_COND1_PREHEADER_US]] @@ -188,49 +188,49 @@ %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert27 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat28 = shufflevector <4 x i32> %broadcast.splatinsert27, <4 x i32> undef, <4 x i32> zeroinitializer - %tmp = add i32 %n.vec, -4 - %tmp1 = lshr i32 %tmp, 2 - %tmp2 = add nuw nsw i32 %tmp1, 1 + %tt = add i32 %n.vec, -4 + %tt1 = lshr i32 %tt, 2 + %tt2 = add nuw nsw i32 %tt1, 1 br label %for.cond1.preheader.us for.cond1.preheader.us: ; preds = %middle.block, %for.cond1.preheader.us.preheader %i.024.us = phi i32 [ %inc9.us, %middle.block ], [ 0, %for.cond1.preheader.us.preheader ] %arrayidx.us = getelementptr inbounds i32*, i32** %A, i32 %i.024.us - %tmp3 = load i32*, i32** %arrayidx.us, align 4 + %tt3 = load i32*, i32** %arrayidx.us, align 4 %arrayidx7.us = getelementptr inbounds i32, i32* %C, i32 %i.024.us %arrayidx7.promoted.us = load i32, i32* %arrayidx7.us, align 4 - %tmp4 = insertelement <4 x i32> , i32 %arrayidx7.promoted.us, i32 0 - call void @llvm.set.loop.iterations.i32(i32 %tmp2) + %tt4 = insertelement <4 x i32> , i32 %arrayidx7.promoted.us, i32 0 + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tt2) br label %vector.body vector.body: ; preds = %vector.body, %for.cond1.preheader.us %index = phi i32 [ 0, %for.cond1.preheader.us ], [ %index.next, %vector.body ] - %vec.phi = phi <4 x i32> [ %tmp4, %for.cond1.preheader.us ], [ %tmp12, %vector.body ] - %tmp5 = phi i32 [ %tmp2, %for.cond1.preheader.us ], [ %tmp13, %vector.body ] + %vec.phi = phi <4 x i32> [ %tt4, %for.cond1.preheader.us ], [ %tt12, %vector.body ] 
+ %tt5 = phi i32 [ %start, %for.cond1.preheader.us ], [ %tt13, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, - %tmp6 = getelementptr inbounds i32, i32* %tmp3, i32 %index + %tt6 = getelementptr inbounds i32, i32* %tt3, i32 %index - ; %tmp7 = icmp ule <4 x i32> %induction, %broadcast.splat28 - %tmp7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) + ; %tt7 = icmp ule <4 x i32> %induction, %broadcast.splat28 + %tt7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N) - %tmp8 = bitcast i32* %tmp6 to <4 x i32>* - %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp8, i32 4, <4 x i1> %tmp7, <4 x i32> undef) - %tmp9 = getelementptr inbounds i32, i32* %B, i32 %index - %tmp10 = bitcast i32* %tmp9 to <4 x i32>* - %wide.masked.load29 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp10, i32 4, <4 x i1> %tmp7, <4 x i32> undef) - %tmp11 = mul nsw <4 x i32> %wide.masked.load29, %wide.masked.load - %tmp12 = add nsw <4 x i32> %vec.phi, %tmp11 + %tt8 = bitcast i32* %tt6 to <4 x i32>* + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tt8, i32 4, <4 x i1> %tt7, <4 x i32> undef) + %tt9 = getelementptr inbounds i32, i32* %B, i32 %index + %tt10 = bitcast i32* %tt9 to <4 x i32>* + %wide.masked.load29 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tt10, i32 4, <4 x i1> %tt7, <4 x i32> undef) + %tt11 = mul nsw <4 x i32> %wide.masked.load29, %wide.masked.load + %tt12 = add nsw <4 x i32> %vec.phi, %tt11 %index.next = add i32 %index, 4 - %tmp13 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp5, i32 1) - %tmp14 = icmp ne i32 %tmp13, 0 - br i1 %tmp14, label %vector.body, label %middle.block + %tt13 = call i32 @llvm.loop.decrement.reg.i32(i32 %tt5, i32 1) + %tt14 = icmp ne 
i32 %tt13, 0 + br i1 %tt14, label %vector.body, label %middle.block middle.block: ; preds = %vector.body - %tmp15 = select <4 x i1> %tmp7, <4 x i32> %tmp12, <4 x i32> %vec.phi - %tmp16 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp15) - store i32 %tmp16, i32* %arrayidx7.us, align 4 + %tt15 = select <4 x i1> %tt7, <4 x i32> %tt12, <4 x i32> %vec.phi + %tt16 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tt15) + store i32 %tt16, i32* %arrayidx7.us, align 4 %inc9.us = add nuw i32 %i.024.us, 1 %exitcond26 = icmp eq i32 %inc9.us, %N br i1 %exitcond26, label %for.cond.cleanup, label %for.cond1.preheader.us @@ -250,7 +250,7 @@ declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #1 ; Function Attrs: noduplicate nounwind -declare void @llvm.set.loop.iterations.i32(i32) #2 +declare i32 @llvm.start.loop.iterations.i32(i32) #2 ; Function Attrs: noduplicate nounwind declare i32 @llvm.loop.decrement.reg.i32(i32, i32) #2 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-vpsel-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-vpsel-liveout.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-vpsel-liveout.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-vpsel-liveout.mir @@ -13,11 +13,11 @@ br i1 %cmp9, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %tmp5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph - %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ] + %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ] %lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %vector.ph ] %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ] %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %tmp13, %vector.body ] @@ -49,7 +49,7 @@ } declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1 declare 
i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #2 - declare void @llvm.set.loop.iterations.i32(i32) #3 + declare i32 @llvm.start.loop.iterations.i32(i32) #3 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4 @@ -152,7 +152,7 @@ renamable $r12 = t2SUBri killed renamable $r3, 4, 14, $noreg, $noreg renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14, $noreg, $noreg - t2DoLoopStart renamable $r12 + $lr = t2DoLoopStart renamable $r12 $r3 = tMOVr killed $r12, 14, $noreg bb.2.vector.body: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-load.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-load.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-load.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-load.mir @@ -14,7 +14,7 @@ br i1 %cmp11, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %5) %6 = shl i32 %4, 3 %7 = sub i32 %N, %6 br label %vector.body @@ -23,7 +23,7 @@ %lsr.iv20 = phi i8* [ %scevgep21, %vector.body ], [ %b, %vector.ph ] %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %vector.ph ] %vec.phi = phi <16 x i8> [ zeroinitializer, %vector.ph ], [ %13, %vector.body ] - %8 = phi i32 [ %5, %vector.ph ], [ %14, %vector.body ] + %8 = phi i32 [ %start, %vector.ph ], [ %14, %vector.body ] %9 = phi i32 [ %N, %vector.ph ], [ %11, %vector.body ] %lsr.iv2022 = bitcast i8* %lsr.iv20 to <16 x i8>* %lsr.iv19 = bitcast i8* %lsr.iv to <16 x i8>* @@ -54,7 +54,7 @@ declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) #1 declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) #2 - declare void @llvm.set.loop.iterations.i32(i32) #3 + declare i32 @llvm.start.loop.iterations.i32(i32) #3 declare i32 
@llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 declare <16 x i1> @llvm.arm.mve.vctp8(i32) #4 @@ -180,7 +180,7 @@ renamable $lr = nuw nsw t2ADDrs killed renamable $r3, renamable $r12, 35, 14, $noreg, $noreg renamable $r3 = t2LSRri killed renamable $r12, 4, 14, $noreg, $noreg renamable $r3 = t2SUBrs renamable $r2, killed renamable $r3, 34, 14, $noreg, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.2.vector.body: successors: %bb.2(0x7c000000), %bb.3(0x04000000) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-store.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-store.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-store.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-store.mir @@ -14,14 +14,14 @@ br i1 %cmp10, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %lsr.iv19 = phi i8* [ %scevgep20, %vector.body ], [ %res, %vector.ph ] %lsr.iv16 = phi i8* [ %scevgep17, %vector.body ], [ %b, %vector.ph ] %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %vector.ph ] - %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ] + %6 = phi i32 [ %start, %vector.ph ], [ %11, %vector.body ] %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ] %lsr.iv1921 = bitcast i8* %lsr.iv19 to <16 x i8>* %lsr.iv1618 = bitcast i8* %lsr.iv16 to <16 x i8>* @@ -45,7 +45,7 @@ declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>) - declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.start.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <16 x i1> @llvm.arm.mve.vctp8(i32) @@ -155,7 +155,7 @@ renamable $r12 = t2BICri killed 
renamable $r12, 15, 14, $noreg, $noreg renamable $r12 = t2SUBri killed renamable $r12, 16, 14, $noreg, $noreg renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 35, 14, $noreg, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.2.vector.body: successors: %bb.2(0x7c000000), %bb.3(0x04000000) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/predicated-invariant.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/predicated-invariant.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/predicated-invariant.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/predicated-invariant.mir @@ -14,11 +14,11 @@ br i1 %cmp9, label %exit, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %tmp5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph - %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ] + %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ] %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ] %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ] %lsr.iv17 = bitcast i16* %lsr.iv to <4 x i16>* @@ -39,7 +39,7 @@ } declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) - declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.start.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <4 x i1> @llvm.arm.mve.vctp32(i32) declare <4 x i32> @llvm.arm.mve.add.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) @@ -123,7 +123,7 @@ renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r1, 19, 14 /* CC::al */, $noreg, $noreg renamable $r1 = tADDrSPi $sp, 2, 14 /* CC::al */, $noreg renamable $q0 = MVE_VLDRWU32 killed renamable $r1, 0, 0, $noreg :: (load 16 from %fixed-stack.0, align 8) - t2DoLoopStart renamable $r3 + $lr = 
t2DoLoopStart renamable $r3 $r1 = tMOVr killed $r3, 14 /* CC::al */, $noreg bb.2.vector.body: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions-vpt-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions-vpt-liveout.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions-vpt-liveout.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions-vpt-liveout.mir @@ -14,14 +14,14 @@ br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %lsr.iv14 = phi i8* [ %scevgep15, %vector.body ], [ %b, %vector.ph ] %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %vector.ph ] %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %14, %vector.body ] - %6 = phi i32 [ %5, %vector.ph ], [ %15, %vector.body ] + %6 = phi i32 [ %start, %vector.ph ], [ %15, %vector.body ] %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ] %lsr.iv13 = bitcast i8* %lsr.iv to <4 x i8>* %lsr.iv1416 = bitcast i8* %lsr.iv14 to <4 x i8>* @@ -61,14 +61,14 @@ br i1 %cmp10.not, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %lsr.iv15 = phi i8* [ %scevgep16, %vector.body ], [ %b, %vector.ph ] %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %vector.ph ] %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %14, %vector.body ] - %6 = phi i32 [ %5, %vector.ph ], [ %15, %vector.body ] + %6 = phi i32 [ %start, %vector.ph ], [ %15, %vector.body ] %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ] %lsr.iv14 = bitcast i8* %lsr.iv to <4 x i8>* %lsr.iv1517 = bitcast i8* %lsr.iv15 to <4 x i8>* @@ -108,14 +108,14 @@ br i1 %cmp9.not, label %for.cond.cleanup, 
label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %lsr.iv14 = phi i16* [ %scevgep15, %vector.body ], [ %b, %vector.ph ] %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ] %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %14, %vector.body ] - %6 = phi i32 [ %5, %vector.ph ], [ %15, %vector.body ] + %6 = phi i32 [ %start, %vector.ph ], [ %15, %vector.body ] %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ] %lsr.iv13 = bitcast i16* %lsr.iv to <4 x i16>* %lsr.iv1416 = bitcast i16* %lsr.iv14 to <4 x i16>* @@ -155,14 +155,14 @@ br i1 %cmp10.not, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %lsr.iv15 = phi i16* [ %scevgep16, %vector.body ], [ %b, %vector.ph ] %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ] %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %14, %vector.body ] - %6 = phi i32 [ %5, %vector.ph ], [ %15, %vector.body ] + %6 = phi i32 [ %start, %vector.ph ], [ %15, %vector.body ] %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ] %lsr.iv14 = bitcast i16* %lsr.iv to <4 x i16>* %lsr.iv1517 = bitcast i16* %lsr.iv15 to <4 x i16>* @@ -203,14 +203,14 @@ br i1 %cmp8.not, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %lsr.iv13 = phi i32* [ %scevgep14, %vector.body ], [ %b, %vector.ph ] %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %a, %vector.ph ] %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %12, %vector.body ] - %6 = 
phi i32 [ %5, %vector.ph ], [ %13, %vector.body ] + %6 = phi i32 [ %start, %vector.ph ], [ %13, %vector.body ] %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ] %lsr.iv12 = bitcast i32* %lsr.iv to <4 x i32>* %lsr.iv1315 = bitcast i32* %lsr.iv13 to <4 x i32>* @@ -249,14 +249,14 @@ br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %b, %vector.ph ] %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %a, %vector.ph ] %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %12, %vector.body ] - %6 = phi i32 [ %5, %vector.ph ], [ %13, %vector.body ] + %6 = phi i32 [ %start, %vector.ph ], [ %13, %vector.body ] %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ] %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>* %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* @@ -286,7 +286,7 @@ declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) - declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.start.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32(i32, i32) declare <4 x i1> @llvm.arm.mve.vctp32(i32) @@ -372,7 +372,7 @@ renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.2.vector.body (align 4): successors: %bb.2(0x7c000000), %bb.3(0x04000000) @@ -478,7 +478,7 @@ renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg renamable $lr = 
nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.2.vector.body (align 4): successors: %bb.2(0x7c000000), %bb.3(0x04000000) @@ -585,7 +585,7 @@ renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.2.vector.body (align 4): successors: %bb.2(0x7c000000), %bb.3(0x04000000) @@ -691,7 +691,7 @@ renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.2.vector.body (align 4): successors: %bb.2(0x7c000000), %bb.3(0x04000000) @@ -797,7 +797,7 @@ renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.2.vector.body (align 4): successors: %bb.2(0x7c000000), %bb.3(0x04000000) @@ -903,7 +903,7 @@ renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.2.vector.body (align 4): successors: %bb.2(0x7c000000), %bb.3(0x04000000) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll 
b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll @@ -69,26 +69,26 @@ ; CHECK-NEXT: .LBB1_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: adds r3, r2, #7 -; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: bic r3, r3, #7 ; CHECK-NEXT: sub.w r12, r3, #8 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #3 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: add.w r3, r3, r12, lsr #3 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.16 r2 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vldrbt.u16 q1, [r0], #8 +; CHECK-NEXT: vldrbt.u16 q0, [r0], #8 ; CHECK-NEXT: subs r2, #8 -; CHECK-NEXT: vadd.i16 q1, q0, q1 +; CHECK-NEXT: vadd.i16 q0, q1, q0 ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrbt.u16 q2, [r1], #8 -; CHECK-NEXT: vadd.i16 q1, q1, q2 +; CHECK-NEXT: vadd.i16 q0, q0, q2 ; CHECK-NEXT: le lr, .LBB1_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u16 r0, q0 ; CHECK-NEXT: pop.w {r7, lr} ; CHECK-NEXT: sxth r0, r0 @@ -142,25 +142,25 @@ ; CHECK-NEXT: .LBB2_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: add.w r3, r2, #15 -; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: bic r3, r3, #15 ; CHECK-NEXT: sub.w r12, r3, #16 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #4 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: add.w r3, r3, r12, lsr #4 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: .LBB2_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.8 r2 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrbt.u8 q1, [r1], #16 +; CHECK-NEXT: vldrbt.u8 q0, [r1], #16 ; CHECK-NEXT: vldrbt.u8 q2, [r0], #16 ; 
CHECK-NEXT: subs r2, #16 -; CHECK-NEXT: vsub.i8 q1, q2, q1 -; CHECK-NEXT: vadd.i8 q1, q1, q0 +; CHECK-NEXT: vsub.i8 q0, q2, q0 +; CHECK-NEXT: vadd.i8 q0, q0, q1 ; CHECK-NEXT: le lr, .LBB2_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u8 r0, q0 ; CHECK-NEXT: pop.w {r7, lr} ; CHECK-NEXT: uxtb r0, r0 @@ -212,25 +212,25 @@ ; CHECK-NEXT: .LBB3_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: adds r3, r2, #7 -; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: bic r3, r3, #7 ; CHECK-NEXT: sub.w r12, r3, #8 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #3 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: add.w r3, r3, r12, lsr #3 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.16 r2 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrbt.u16 q1, [r0], #8 +; CHECK-NEXT: vldrbt.u16 q0, [r0], #8 ; CHECK-NEXT: vldrbt.u16 q2, [r1], #8 ; CHECK-NEXT: subs r2, #8 -; CHECK-NEXT: vsub.i16 q1, q2, q1 -; CHECK-NEXT: vadd.i16 q1, q1, q0 +; CHECK-NEXT: vsub.i16 q0, q2, q0 +; CHECK-NEXT: vadd.i16 q0, q0, q1 ; CHECK-NEXT: le lr, .LBB3_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u16 r0, q0 ; CHECK-NEXT: pop.w {r7, lr} ; CHECK-NEXT: sxth r0, r0 @@ -284,25 +284,25 @@ ; CHECK-NEXT: .LBB4_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: add.w r3, r2, #15 -; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: bic r3, r3, #15 ; CHECK-NEXT: sub.w r12, r3, #16 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #4 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: add.w r3, r3, r12, lsr #4 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: .LBB4_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.8 r2 -; 
CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrbt.u8 q1, [r0], #16 +; CHECK-NEXT: vldrbt.u8 q0, [r0], #16 ; CHECK-NEXT: vldrbt.u8 q2, [r1], #16 ; CHECK-NEXT: subs r2, #16 -; CHECK-NEXT: vmul.i8 q1, q2, q1 -; CHECK-NEXT: vadd.i8 q1, q1, q0 +; CHECK-NEXT: vmul.i8 q0, q2, q0 +; CHECK-NEXT: vadd.i8 q0, q0, q1 ; CHECK-NEXT: le lr, .LBB4_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u8 r0, q0 ; CHECK-NEXT: pop.w {r7, lr} ; CHECK-NEXT: uxtb r0, r0 @@ -354,25 +354,25 @@ ; CHECK-NEXT: .LBB5_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: adds r3, r2, #7 -; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: bic r3, r3, #7 ; CHECK-NEXT: sub.w r12, r3, #8 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #3 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: add.w r3, r3, r12, lsr #3 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: .LBB5_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.16 r2 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrbt.u16 q1, [r0], #8 +; CHECK-NEXT: vldrbt.u16 q0, [r0], #8 ; CHECK-NEXT: vldrbt.u16 q2, [r1], #8 ; CHECK-NEXT: subs r2, #8 -; CHECK-NEXT: vmul.i16 q1, q2, q1 -; CHECK-NEXT: vadd.i16 q1, q1, q0 +; CHECK-NEXT: vmul.i16 q0, q2, q0 +; CHECK-NEXT: vadd.i16 q0, q0, q1 ; CHECK-NEXT: le lr, .LBB5_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u16 r0, q0 ; CHECK-NEXT: pop.w {r7, lr} ; CHECK-NEXT: sxth r0, r0 @@ -423,36 +423,36 @@ ; CHECK-NEXT: beq .LBB6_8 ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: adds r3, r2, #3 -; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: subs r6, r3, #4 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: mov r5, r1 -; CHECK-NEXT: add.w lr, 
r3, r6, lsr #2 +; CHECK-NEXT: add.w r3, r3, r6, lsr #2 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: mov r3, r2 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB6_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r3 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrbt.u32 q1, [r4], #4 +; CHECK-NEXT: vldrbt.u32 q0, [r4], #4 ; CHECK-NEXT: vldrbt.u32 q2, [r5], #4 ; CHECK-NEXT: subs r3, #4 -; CHECK-NEXT: vmul.i32 q1, q2, q1 -; CHECK-NEXT: vadd.i32 q1, q1, q0 +; CHECK-NEXT: vmul.i32 q0, q2, q0 +; CHECK-NEXT: vadd.i32 q0, q0, q1 ; CHECK-NEXT: le lr, .LBB6_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r12, q0 ; CHECK-NEXT: cbz r2, .LBB6_7 ; CHECK-NEXT: @ %bb.4: @ %vector.ph47 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r6, lsr #2 -; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: vdup.32 q0, r3 +; CHECK-NEXT: add.w r3, r3, r6, lsr #2 +; CHECK-NEXT: movs r6, #0 +; CHECK-NEXT: vdup.32 q0, r6 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: vmov.32 q0[0], r12 ; CHECK-NEXT: .LBB6_5: @ %vector.body46 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -550,32 +550,32 @@ ; CHECK-NEXT: cbz r2, .LBB7_4 ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: adds r3, r2, #7 -; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: bic r3, r3, #7 ; CHECK-NEXT: movs r4, #1 +; CHECK-NEXT: bic r3, r3, #7 +; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: subs r3, #8 -; CHECK-NEXT: vmov q3, q1 -; CHECK-NEXT: add.w lr, r4, r3, lsr #3 -; CHECK-NEXT: mov r3, r0 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: add.w r3, r4, r3, lsr #3 ; CHECK-NEXT: mov r4, r1 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dls lr, r3 +; CHECK-NEXT: mov r3, r0 ; CHECK-NEXT: .LBB7_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.16 r2 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrbt.u16 
q1, [r3], #8 +; CHECK-NEXT: vldrbt.u16 q0, [r3], #8 ; CHECK-NEXT: vldrbt.u16 q4, [r4], #8 ; CHECK-NEXT: vmov q2, q3 -; CHECK-NEXT: vsub.i16 q3, q4, q1 -; CHECK-NEXT: vmul.i16 q1, q4, q1 +; CHECK-NEXT: vsub.i16 q3, q4, q0 +; CHECK-NEXT: vmul.i16 q0, q4, q0 ; CHECK-NEXT: subs r2, #8 ; CHECK-NEXT: vadd.i16 q3, q3, q2 -; CHECK-NEXT: vadd.i16 q1, q1, q0 +; CHECK-NEXT: vadd.i16 q0, q0, q1 ; CHECK-NEXT: le lr, .LBB7_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block ; CHECK-NEXT: vpsel q2, q3, q2 -; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u16 r4, q2 ; CHECK-NEXT: vaddv.u16 r2, q0 ; CHECK-NEXT: b .LBB7_5 @@ -643,40 +643,40 @@ ; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: mov r1, r0 ; CHECK-NEXT: movw r12, #47184 -; CHECK-NEXT: movw r3, #23593 ; CHECK-NEXT: ldrd r2, lr, [r1, #4] +; CHECK-NEXT: movw r1, #23593 ; CHECK-NEXT: movt r12, #1310 -; CHECK-NEXT: movt r3, #49807 -; CHECK-NEXT: mla r3, lr, r3, r12 -; CHECK-NEXT: movw r1, #55051 +; CHECK-NEXT: movt r1, #49807 +; CHECK-NEXT: mla r1, lr, r1, r12 +; CHECK-NEXT: movw r3, #55051 ; CHECK-NEXT: movw r4, #23593 -; CHECK-NEXT: movt r1, #163 +; CHECK-NEXT: movt r3, #163 ; CHECK-NEXT: ldr r0, [r0] ; CHECK-NEXT: movt r4, #655 -; CHECK-NEXT: ror.w r12, r3, #4 -; CHECK-NEXT: cmp r12, r1 -; CHECK-NEXT: cset r1, lo -; CHECK-NEXT: ror.w r3, r3, #2 +; CHECK-NEXT: ror.w r12, r1, #4 +; CHECK-NEXT: cmp r12, r3 +; CHECK-NEXT: cset r3, lo +; CHECK-NEXT: ror.w r1, r1, #2 ; CHECK-NEXT: mov.w r12, #1 -; CHECK-NEXT: cmp r3, r4 -; CHECK-NEXT: csel r3, r1, r12, lo +; CHECK-NEXT: cmp r1, r4 +; CHECK-NEXT: csel r1, r3, r12, lo ; CHECK-NEXT: lsls.w r4, lr, #30 -; CHECK-NEXT: csel r1, r1, r3, ne +; CHECK-NEXT: csel r3, r3, r1, ne ; CHECK-NEXT: cmp r2, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: .LBB8_1: @ %vector.ph -; CHECK-NEXT: adds r3, r2, #3 -; CHECK-NEXT: movs r4, #52 -; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: subs r3, #4 -; CHECK-NEXT: add.w lr, r12, r3, lsr #2 -; CHECK-NEXT: movw r3, 
:lower16:days -; CHECK-NEXT: movt r3, :upper16:days -; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: mla r1, r1, r4, r3 +; CHECK-NEXT: adds r1, r2, #3 +; CHECK-NEXT: bic r1, r1, #3 +; CHECK-NEXT: subs r1, #4 +; CHECK-NEXT: add.w r4, r12, r1, lsr #2 +; CHECK-NEXT: movw r12, :lower16:days +; CHECK-NEXT: movt r12, :upper16:days +; CHECK-NEXT: movs r1, #52 +; CHECK-NEXT: mla r1, r3, r1, r12 ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vdup.32 q0, r3 +; CHECK-NEXT: dls lr, r4 ; CHECK-NEXT: vmov.32 q0[0], r0 ; CHECK-NEXT: .LBB8_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/remat-vctp.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/remat-vctp.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/remat-vctp.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/remat-vctp.ll @@ -105,8 +105,8 @@ ; CHECK-NEXT: vmov.i32 q2, #0x1 ; CHECK-NEXT: add.w lr, r5, #3 ; CHECK-NEXT: movs r5, #1 -; CHECK-NEXT: add.w lr, r5, lr, lsr #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: add.w r5, r5, lr, lsr #2 +; CHECK-NEXT: dls lr, r5 ; CHECK-NEXT: .LBB1_1: @ %bb6 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r12 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/remove-elem-moves.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/remove-elem-moves.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/remove-elem-moves.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/remove-elem-moves.mir @@ -31,13 +31,13 @@ %ind.end17 = getelementptr float, float* %pDst, i32 %n.vec %scevgep9 = getelementptr float, float* %pDst, i32 -4 %scevgep14 = getelementptr float, float* %pSrc, i32 -4 - call void @llvm.set.loop.iterations.i32(i32 %4) + %start1 = call i32 @llvm.start.loop.iterations.i32(i32 %4) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %lsr.iv15 = phi float* [ %scevgep16, %vector.body ], [ %scevgep14, %vector.ph ] %lsr.iv10 = phi float* [ %scevgep11, %vector.body ], [ %scevgep9, %vector.ph ] - %5 = 
phi i32 [ %4, %vector.ph ], [ %7, %vector.body ] + %5 = phi i32 [ %start1, %vector.ph ], [ %7, %vector.body ] %lsr.iv1517 = bitcast float* %lsr.iv15 to <4 x float>* %lsr.iv1012 = bitcast float* %lsr.iv10 to <4 x float>* %scevgep18 = getelementptr <4 x float>, <4 x float>* %lsr.iv1517, i32 1 @@ -61,13 +61,13 @@ %pDst.addr.06.ph = phi float* [ %pDst, %vector.memcheck ], [ %pDst, %while.body.preheader ], [ %ind.end17, %middle.block ] %scevgep1 = getelementptr float, float* %pSrc.addr.07.ph, i32 -1 %scevgep4 = getelementptr float, float* %pDst.addr.06.ph, i32 -1 - call void @llvm.set.loop.iterations.i32(i32 %blkCnt.08.ph) + %start2 = call i32 @llvm.start.loop.iterations.i32(i32 %blkCnt.08.ph) br label %while.body while.body: ; preds = %while.body, %while.body.preheader19 %lsr.iv5 = phi float* [ %scevgep6, %while.body ], [ %scevgep4, %while.body.preheader19 ] %lsr.iv = phi float* [ %scevgep2, %while.body ], [ %scevgep1, %while.body.preheader19 ] - %9 = phi i32 [ %blkCnt.08.ph, %while.body.preheader19 ], [ %12, %while.body ] + %9 = phi i32 [ %start2, %while.body.preheader19 ], [ %12, %while.body ] %scevgep3 = getelementptr float, float* %lsr.iv, i32 1 %scevgep7 = getelementptr float, float* %lsr.iv5, i32 1 %10 = load float, float* %scevgep3, align 4 @@ -84,7 +84,7 @@ } declare float @llvm.fabs.f32(float) declare <4 x float> @llvm.fabs.v4f32(<4 x float>) - declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.start.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) ... 
@@ -262,7 +262,7 @@ renamable $r7, dead $cpsr = tSUBrr renamable $r2, renamable $r4, 14, $noreg renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14, $noreg, $noreg renamable $r12 = t2ADDrs renamable $r0, renamable $r4, 18, 14, $noreg, $noreg - t2DoLoopStart renamable $r3 + $lr = t2DoLoopStart renamable $r3 renamable $r0, dead $cpsr = tSUBi8 killed renamable $r0, 16, 14, $noreg $r5 = tMOVr killed $r3, 14, $noreg renamable $r3 = t2ADDrs renamable $r1, renamable $r4, 18, 14, $noreg, $noreg @@ -305,7 +305,7 @@ renamable $r0, dead $cpsr = tSUBi3 killed renamable $r3, 4, 14, $noreg renamable $r1 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.8.while.body: successors: %bb.8(0x7c000000), %bb.9(0x04000000) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/revert-after-call.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/revert-after-call.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/revert-after-call.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/revert-after-call.mir @@ -14,12 +14,12 @@ br i1 %cmp6, label %while.end, label %while.body.preheader while.body.preheader: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %n) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %n) br label %while.body while.body: ; preds = %while.body, %while.body.preheader %res.07 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ] - %0 = phi i32 [ %n, %while.body.preheader ], [ %1, %while.body ] + %0 = phi i32 [ %start, %while.body.preheader ], [ %1, %while.body ] %call = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() %add = add nsw i32 %call, %res.07 %1 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %0, i32 1) @@ -33,7 +33,7 @@ declare i32 @bar(...) 
local_unnamed_addr #0 - declare void @llvm.set.loop.iterations.i32(i32) #1 + declare i32 @llvm.start.loop.iterations.i32(i32) #1 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1 attributes #0 = { "target-features"="+mve.fp" } @@ -109,7 +109,7 @@ $lr = tMOVr $r0, 14, $noreg renamable $r4, dead $cpsr = tMOVi8 0, 14, $noreg - t2DoLoopStart killed $r0 + $lr = t2DoLoopStart killed $r0 bb.2.while.body: successors: %bb.2(0x7c000000), %bb.3(0x04000000) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/revert-after-read.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/revert-after-read.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/revert-after-read.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/revert-after-read.mir @@ -14,11 +14,11 @@ br i1 %cmp6, label %while.end, label %while.body.preheader while.body.preheader: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %n) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %n) br label %while.body while.body: ; preds = %while.body, %while.body.preheader - %0 = phi i32 [ %n, %while.body.preheader ], [ %1, %while.body ] + %0 = phi i32 [ %start, %while.body.preheader ], [ %1, %while.body ] %1 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %0, i32 1) %add = add i32 %1, 0 %2 = icmp ne i32 %1, 0 @@ -29,7 +29,7 @@ ret i32 %res.0.lcssa } - declare void @llvm.set.loop.iterations.i32(i32) #1 + declare i32 @llvm.start.loop.iterations.i32(i32) #1 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1 attributes #0 = { "target-features"="+mve.fp" } @@ -96,7 +96,7 @@ liveins: $r0 $lr = tMOVr $r0, 14, $noreg - t2DoLoopStart killed $r0 + $lr = t2DoLoopStart killed $r0 bb.2.while.body: successors: %bb.2(0x7c000000), %bb.3(0x04000000) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/revert-after-write.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/revert-after-write.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/revert-after-write.mir +++ 
b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/revert-after-write.mir @@ -14,11 +14,11 @@ br i1 %cmp6, label %while.end, label %while.body.preheader while.body.preheader: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %n) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %n) br label %while.body while.body: ; preds = %while.body, %while.body.preheader - %0 = phi i32 [ %n, %while.body.preheader ], [ %1, %while.body ] + %0 = phi i32 [ %start, %while.body.preheader ], [ %1, %while.body ] %1 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %0, i32 1) %add = add i32 %1, 2 %2 = icmp ne i32 %1, 0 @@ -30,7 +30,7 @@ } ; Function Attrs: noduplicate nounwind - declare void @llvm.set.loop.iterations.i32(i32) #1 + declare i32 @llvm.start.loop.iterations.i32(i32) #1 ; Function Attrs: noduplicate nounwind declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1 @@ -102,7 +102,7 @@ liveins: $r0 $lr = tMOVr $r0, 14, $noreg - t2DoLoopStart killed $r0 + $lr = t2DoLoopStart killed $r0 bb.2.while.body: successors: %bb.2(0x7c000000), %bb.3(0x04000000) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/revert-non-header.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/revert-non-header.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/revert-non-header.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/revert-non-header.mir @@ -30,7 +30,7 @@ %gap.057 = sdiv i32 %gap.057.in, 2 %cmp252 = icmp slt i32 %gap.057, %n %tmp = sub i32 %n, %gap.057 - call void @llvm.set.loop.iterations.i32(i32 %tmp) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp) br i1 %cmp252, label %for.cond4.preheader.preheader, label %for.cond.loopexit for.cond4.preheader.preheader: ; preds = %for.cond1.preheader @@ -44,7 +44,7 @@ %lsr.iv2 = phi i32* [ %scevgep3, %for.inc16 ], [ %scevgep1, %for.cond4.preheader.preheader ] %lsr.iv = phi i32* [ %v, %for.cond4.preheader.preheader ], [ %scevgep, %for.inc16 ] %i.053 = phi i32 [ %inc, %for.inc16 ], [ %gap.057, 
%for.cond4.preheader.preheader ] - %tmp8 = phi i32 [ %tmp, %for.cond4.preheader.preheader ], [ %tmp16, %for.inc16 ] + %tmp8 = phi i32 [ %start, %for.cond4.preheader.preheader ], [ %tmp16, %for.inc16 ] %j.048 = sub nsw i32 %i.053, %gap.057 %cmp549 = icmp sgt i32 %j.048, -1 br i1 %cmp549, label %land.rhs.preheader, label %for.inc16 @@ -93,7 +93,7 @@ } ; Function Attrs: noduplicate nounwind - declare void @llvm.set.loop.iterations.i32(i32) #0 + declare i32 @llvm.start.loop.iterations.i32(i32) #0 ; Function Attrs: noduplicate nounwind declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #0 @@ -208,7 +208,7 @@ renamable $lr = t2SUBrs renamable $r1, renamable $r2, 9, 14, $noreg, $noreg renamable $r9 = t2ASRri renamable $r2, 1, 14, $noreg, $noreg t2CMPrs renamable $r1, killed renamable $r2, 9, 14, $noreg, implicit-def $cpsr - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr tBcc %bb.2, 13, killed $cpsr bb.4.for.cond4.preheader.preheader: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/safe-def-no-mov.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/safe-def-no-mov.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/safe-def-no-mov.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/safe-def-no-mov.mir @@ -11,7 +11,7 @@ entry: %scevgep = getelementptr i32, i32* %q, i32 -1 %scevgep3 = getelementptr i32, i32* %p, i32 -1 - call void @llvm.set.loop.iterations.i32(i32 %n) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %n) br label %preheader preheader: @@ -20,7 +20,7 @@ while.body: ; preds = %while.body, %entry %lsr.iv4 = phi i32* [ %scevgep5, %while.body ], [ %scevgep3, %preheader ] %lsr.iv = phi i32* [ %scevgep1, %while.body ], [ %scevgep, %preheader ] - %0 = phi i32 [ %n, %preheader ], [ %2, %while.body ] + %0 = phi i32 [ %start, %preheader ], [ %2, %while.body ] %scevgep6 = getelementptr i32, i32* %lsr.iv, i32 1 %scevgep2 = getelementptr i32, i32* %lsr.iv4, i32 1 %1 = load i32, i32* %scevgep6, align 4 @@ -35,7 +35,7 @@ ret i32 0 } - 
declare void @llvm.set.loop.iterations.i32(i32) #0 + declare i32 @llvm.start.loop.iterations.i32(i32) #0 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #0 attributes #0 = { noduplicate nounwind } @@ -120,7 +120,7 @@ frame-setup CFI_INSTRUCTION def_cfa_offset 8 frame-setup CFI_INSTRUCTION offset $lr, -4 frame-setup CFI_INSTRUCTION offset $r7, -8 - t2DoLoopStart $r0 + $lr = t2DoLoopStart $r0 renamable $r0, dead $cpsr = tSUBi3 killed renamable $r1, 4, 14, $noreg renamable $r1, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14, $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/safe-retaining.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/safe-retaining.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/safe-retaining.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/safe-retaining.mir @@ -8,11 +8,11 @@ br i1 %cmp, label %exit, label %loop.ph loop.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %iters) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %iters) br label %loop.body loop.body: ; preds = %loop.body, %loop.ph - %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %iters, %loop.ph ] + %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %start, %loop.ph ] %count = phi i32 [ %elts, %loop.ph ], [ %elts.rem, %loop.body ] %addr.a = phi <4 x i32>* [ %a, %loop.ph ], [ %addr.a.next, %loop.body ] %addr.b = phi <4 x i32>* [ %b, %loop.ph ], [ %addr.b.next, %loop.body ] @@ -43,11 +43,11 @@ br i1 %cmp, label %exit, label %loop.ph loop.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %iters) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %iters) br label %loop.body loop.body: ; preds = %loop.body, %loop.ph - %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %iters, %loop.ph ] + %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %start, %loop.ph ] %count = phi i32 [ %elts, %loop.ph ], [ %elts.rem, %loop.body ] %addr.a = phi <8 x i16>* [ %a, %loop.ph ], [ %addr.a.next, %loop.body ] %addr.b = phi <8 x 
i16>* [ %b, %loop.ph ], [ %addr.b.next, %loop.body ] @@ -72,7 +72,7 @@ ret void } - declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.start.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <4 x i1> @llvm.arm.mve.vctp32(i32) declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) @@ -160,7 +160,7 @@ liveins: $r0, $r1, $r2, $r3, $r4, $lr renamable $r4 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) - t2DoLoopStart renamable $r4 + $lr = t2DoLoopStart renamable $r4 $r12 = tMOVr killed $r4, 14 /* CC::al */, $noreg bb.2.loop.body: @@ -261,7 +261,7 @@ liveins: $r0, $r1, $r2, $r3, $r4, $lr renamable $r12 = t2LDRi12 $sp, 8, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) - t2DoLoopStart renamable $r12 + $lr = t2DoLoopStart renamable $r12 $r4 = tMOVr killed $r12, 14 /* CC::al */, $noreg bb.2.loop.body: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll @@ -15,29 +15,29 @@ ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB0_3 Depth 2 ; CHECK-NEXT: @ Child Loop BB0_5 Depth 2 -; CHECK-NEXT: movs r5, #0 ; CHECK-NEXT: dls lr, r3 +; CHECK-NEXT: movs r6, #0 ; CHECK-NEXT: .LBB0_3: @ %for.body4.us ; CHECK-NEXT: @ Parent Loop BB0_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: ldrh.w r6, [r0, r5, lsl #1] -; CHECK-NEXT: ldrh.w r7, [r1, r5, lsl #1] -; CHECK-NEXT: add r6, r7 -; CHECK-NEXT: strh.w r6, [r4, r5, lsl #1] -; CHECK-NEXT: adds r5, #1 +; CHECK-NEXT: ldrh.w r5, [r0, r6, lsl #1] +; CHECK-NEXT: ldrh.w r7, [r1, r6, lsl #1] +; CHECK-NEXT: add r5, r7 +; CHECK-NEXT: strh.w r5, [r4, r6, lsl #1] +; CHECK-NEXT: adds r6, #1 ; CHECK-NEXT: le lr, .LBB0_3 ; CHECK-NEXT: @ %bb.4: @ 
%for.body15.us.preheader ; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1 -; CHECK-NEXT: movs r5, #0 ; CHECK-NEXT: dls lr, r3 +; CHECK-NEXT: movs r6, #0 ; CHECK-NEXT: .LBB0_5: @ %for.body15.us ; CHECK-NEXT: @ Parent Loop BB0_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: ldrh.w r7, [r0, r5, lsl #1] -; CHECK-NEXT: ldrh.w r6, [r1, r5, lsl #1] -; CHECK-NEXT: add r6, r7 -; CHECK-NEXT: strh.w r6, [r2, r5, lsl #1] -; CHECK-NEXT: adds r5, #1 +; CHECK-NEXT: ldrh.w r7, [r0, r6, lsl #1] +; CHECK-NEXT: ldrh.w r5, [r1, r6, lsl #1] +; CHECK-NEXT: add r5, r7 +; CHECK-NEXT: strh.w r5, [r2, r6, lsl #1] +; CHECK-NEXT: adds r6, #1 ; CHECK-NEXT: le lr, .LBB0_5 ; CHECK-NEXT: @ %bb.6: @ %for.cond.cleanup14.us ; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/size-limit.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/size-limit.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/size-limit.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/size-limit.mir @@ -16,7 +16,7 @@ %scevgep = getelementptr i32, i32* %a, i32 -1 %scevgep4 = getelementptr i32, i32* %c, i32 -1 %scevgep8 = getelementptr i32, i32* %b, i32 -1 - call void @llvm.set.loop.iterations.i32(i32 %N) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %N) br label %for.body for.cond.cleanup: ; preds = %for.body, %entry @@ -26,7 +26,7 @@ %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ] %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ] %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ] - %0 = phi i32 [ %N, %for.body.preheader ], [ %3, %for.body ] + %0 = phi i32 [ %start, %for.body.preheader ], [ %3, %for.body ] %size = call i32 @llvm.arm.space(i32 4070, i32 undef) %scevgep3 = getelementptr i32, i32* %lsr.iv9, i32 1 %1 = load i32, i32* %scevgep3, align 4 @@ -47,7 +47,7 @@ declare i32 @llvm.arm.space(i32 immarg, i32) #0 ; Function Attrs: noduplicate nounwind - 
declare void @llvm.set.loop.iterations.i32(i32) #1 + declare i32 @llvm.start.loop.iterations.i32(i32) #1 ; Function Attrs: noduplicate nounwind declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1 @@ -155,7 +155,7 @@ renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14, $noreg renamable $r0, dead $cpsr = tSUBi8 killed renamable $r0, 4, 14, $noreg $lr = tMOVr $r3, 14, $noreg - t2DoLoopStart killed $r3 + $lr = t2DoLoopStart killed $r3 bb.2.for.body: successors: %bb.2(0x7c000000), %bb.3(0x04000000) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/skip-debug.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/skip-debug.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/skip-debug.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/skip-debug.mir @@ -20,7 +20,7 @@ vector.ph: ; preds = %entry %7 = insertelement <4 x i32> , i32 %0, i32 0, !dbg !32 - call void @llvm.set.loop.iterations.i32(i32 %6), !dbg !32 + %start = call i32 @llvm.start.loop.iterations.i32(i32 %6), !dbg !32 %8 = shl i32 %5, 2, !dbg !32 %9 = sub i32 %N, %8, !dbg !32 br label %vector.body, !dbg !32 @@ -28,7 +28,7 @@ vector.body: ; preds = %vector.body, %vector.ph %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %b, %vector.ph ], !dbg !33 %vec.phi = phi <4 x i32> [ %7, %vector.ph ], [ %15, %vector.body ] - %10 = phi i32 [ %6, %vector.ph ], [ %16, %vector.body ] + %10 = phi i32 [ %start, %vector.ph ], [ %16, %vector.body ] %11 = phi i32 [ %N, %vector.ph ], [ %13, %vector.body ] %lsr.iv14 = bitcast i16* %lsr.iv to <4 x i16>* %12 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %11), !dbg !34 @@ -59,7 +59,7 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) - declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.start.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <4 x i1> 
@llvm.arm.mve.vctp32(i32) @@ -261,7 +261,7 @@ renamable $lr = nuw nsw t2ADDrs killed renamable $lr, renamable $r3, 19, 14, $noreg, $noreg, debug-location !32 renamable $r3, dead $cpsr = tLSRri killed renamable $r3, 2, 14, $noreg, debug-location !32 renamable $r3 = t2SUBrs renamable $r2, killed renamable $r3, 18, 14, $noreg, $noreg, debug-location !32 - t2DoLoopStart renamable $lr, debug-location !32 + $lr = t2DoLoopStart renamable $lr, debug-location !32 bb.2.vector.body: successors: %bb.2(0x7c000000), %bb.3(0x04000000) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/switch.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/switch.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/switch.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/switch.mir @@ -17,7 +17,7 @@ br i1 %cmp11, label %for.cond.cleanup, label %for.body.preheader for.body.preheader: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %N) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %N) br label %for.body for.cond.cleanup: ; preds = %for.inc, %entry @@ -30,7 +30,7 @@ %lsr.iv1 = phi i8* [ %c, %for.body.preheader ], [ %scevgep, %for.inc ] %spaces.013 = phi i32 [ %spaces.1, %for.inc ], [ 0, %for.body.preheader ] %found.012 = phi i32 [ %found.1, %for.inc ], [ 0, %for.body.preheader ] - %0 = phi i32 [ %N, %for.body.preheader ], [ %3, %for.inc ] + %0 = phi i32 [ %start, %for.body.preheader ], [ %3, %for.inc ] %1 = load i8, i8* %lsr.iv1, align 1 %2 = zext i8 %1 to i32 switch i32 %2, label %for.inc [ @@ -58,7 +58,7 @@ } ; Function Attrs: noduplicate nounwind - declare void @llvm.set.loop.iterations.i32(i32) #0 + declare i32 @llvm.start.loop.iterations.i32(i32) #0 ; Function Attrs: noduplicate nounwind declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #0 @@ -130,7 +130,7 @@ liveins: $r0, $r1 $lr = tMOVr $r1, 14, $noreg - t2DoLoopStart killed $r1 + $lr = t2DoLoopStart killed $r1 renamable $r1, dead $cpsr = tMOVi8 0, 14, $noreg renamable $r12 = t2MOVi 1, 14, $noreg, $noreg 
renamable $r2, dead $cpsr = tMOVi8 0, 14, $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll @@ -25,12 +25,12 @@ %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert10 = insertelement <16 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat11 = shufflevector <16 x i32> %broadcast.splatinsert10, <16 x i32> undef, <16 x i32> zeroinitializer - call void @llvm.set.loop.iterations.i32(i32 %tmp13) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ] + %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] %broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer %induction = or <16 x i32> %broadcast.splat, @@ -82,12 +82,12 @@ %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer - call void @llvm.set.loop.iterations.i32(i32 %tmp13) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ] + %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, 
<8 x i32> undef, <8 x i32> zeroinitializer %induction = add <8 x i32> %broadcast.splat, @@ -138,12 +138,12 @@ %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer - call void @llvm.set.loop.iterations.i32(i32 %tmp13) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ] + %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = or <4 x i32> %broadcast.splat, @@ -193,12 +193,12 @@ %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer - call void @llvm.set.loop.iterations.i32(i32 %tmp13) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ] + %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, @@ -252,12 +252,12 @@ %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer - call void @llvm.set.loop.iterations.i32(i32 %tmp13) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ] + %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, @@ -311,12 +311,12 @@ %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer - call void @llvm.set.loop.iterations.i32(i32 %tmp13) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ] + %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, @@ -374,7 +374,7 @@ %scevgep = getelementptr i32, i32* %A, i32 8 %scevgep30 = getelementptr i32, i32* %C, i32 8 %scevgep37 = getelementptr i32, i32* %B, i32 8 - call void @llvm.set.loop.iterations.i32(i32 %v5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %v5) br label %vector.body vector.body: @@ -382,7 +382,7 @@ %lsr.iv31 = phi i32* [ %scevgep32, %vector.body ], [ 
%scevgep30, %vector.ph ] %lsr.iv = phi i32* [ %scevgep25, %vector.body ], [ %scevgep, %vector.ph ] %index = phi i32 [ 0, %vector.ph ], [ %v14, %vector.body ] - %v6 = phi i32 [ %v5, %vector.ph ], [ %v15, %vector.body ] + %v6 = phi i32 [ %start, %vector.ph ], [ %v15, %vector.body ] %lsr.iv3840 = bitcast i32* %lsr.iv38 to <4 x i32>* %lsr.iv3133 = bitcast i32* %lsr.iv31 to <4 x i32>* %lsr.iv26 = bitcast i32* %lsr.iv to <4 x i32>* @@ -447,7 +447,7 @@ br i1 %cmp8, label %vector.ph, label %for.cond.cleanup vector.ph: - call void @llvm.set.loop.iterations.i32(i32 %5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -455,7 +455,7 @@ %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ] %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ] %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %6 = phi i32 [ %5, %vector.ph ], [ %8, %vector.body ] + %6 = phi i32 [ %start, %vector.ph ], [ %8, %vector.body ] %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>* %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>* @@ -496,7 +496,7 @@ vector.ph: ; preds = %entry %trip.count.minus.1 = add i32 %N, -1 - call void @llvm.set.loop.iterations.i32(i32 %5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -504,7 +504,7 @@ %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ] %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ] %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %6 = phi i32 [ %5, %vector.ph ], [ %8, %vector.body ] + %6 = phi i32 [ %start, %vector.ph ], [ %8, %vector.body ] %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>* %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* @@ -547,7 +547,7 @@ vector.ph: ; preds = %entry %trip.count.minus.1 = add i32 %N, -1 - call void 
@llvm.set.loop.iterations.i32(i32 %5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -558,7 +558,7 @@ ; AddRec base is not 0: %index = phi i32 [ 1, %vector.ph ], [ %index.next, %vector.body ] - %6 = phi i32 [ %5, %vector.ph ], [ %8, %vector.body ] + %6 = phi i32 [ %start, %vector.ph ], [ %8, %vector.body ] %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>* %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>* @@ -589,7 +589,7 @@ declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32 immarg, <2 x i1>) declare <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32 immarg, <2 x i1>, <2 x i64>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) -declare void @llvm.set.loop.iterations.i32(i32) +declare i32 @llvm.start.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32(i32, i32) declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll @@ -4,14 +4,14 @@ define dso_local void @foo(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 8001) +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 8001) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[LSR_IV14:%.*]] = phi i32* [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[LSR_IV11:%.*]] = 
phi i32* [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ] ; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ] ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ 8001, [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 32003, [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>* ; CHECK-NEXT: [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>* @@ -36,7 +36,7 @@ ; CHECK-NEXT: ret void ; entry: - call void @llvm.set.loop.iterations.i32(i32 8001) + %start = call i32 @llvm.start.loop.iterations.i32(i32 8001) br label %vector.body vector.body: @@ -44,7 +44,7 @@ %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ] %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ] %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ] + %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ] %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>* %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>* @@ -77,13 +77,13 @@ define dso_local void @foo2(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { ; CHECK-LABEL: @foo2( ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 2000) +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 2000) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[LSR_IV14:%.*]] = phi i32* [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[LSR_IV11:%.*]] = phi i32* [ 
[[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ] ; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ 2000, [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>* ; CHECK-NEXT: [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>* ; CHECK-NEXT: [[LSR_IV10:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>* @@ -101,14 +101,14 @@ ; CHECK-NEXT: ret void ; entry: - call void @llvm.set.loop.iterations.i32(i32 2000) + %start = call i32 @llvm.start.loop.iterations.i32(i32 2000) br label %vector.body vector.body: %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ] %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ] %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ] - %0 = phi i32 [ 2000, %entry ], [ %2, %vector.body ] + %0 = phi i32 [ %start, %entry ], [ %2, %vector.body ] %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>* %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>* @@ -131,14 +131,14 @@ define dso_local void @foo3(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { ; CHECK-LABEL: @foo3( ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 8001) +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 8001) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[LSR_IV14:%.*]] = phi i32* [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[LSR_IV11:%.*]] = phi i32* [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ] ; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] 
], [ [[B:%.*]], [[ENTRY]] ] ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ 8001, [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>* ; CHECK-NEXT: [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>* ; CHECK-NEXT: [[LSR_IV10:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>* @@ -161,7 +161,7 @@ ; CHECK-NEXT: ret void ; entry: - call void @llvm.set.loop.iterations.i32(i32 8001) + %start = call i32 @llvm.start.loop.iterations.i32(i32 8001) br label %vector.body vector.body: @@ -169,7 +169,7 @@ %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ] %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ] %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ] + %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ] %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>* %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>* @@ -199,14 +199,14 @@ define dso_local void @foo5(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { ; CHECK-LABEL: @foo5( ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 8001) +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 8001) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[LSR_IV14:%.*]] = phi i32* [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[LSR_IV11:%.*]] = phi i32* [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ] ; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ] 
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ 8001, [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>* ; CHECK-NEXT: [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>* ; CHECK-NEXT: [[LSR_IV10:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>* @@ -229,7 +229,7 @@ ; CHECK-NEXT: ret void ; entry: - call void @llvm.set.loop.iterations.i32(i32 8001) + %start = call i32 @llvm.start.loop.iterations.i32(i32 8001) br label %vector.body vector.body: @@ -237,7 +237,7 @@ %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ] %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ] %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ] + %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ] %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>* %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>* @@ -273,7 +273,7 @@ ; define dso_local void @inconsistent_tripcounts(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { entry: - call void @llvm.set.loop.iterations.i32(i32 8001) + %start = call i32 @llvm.start.loop.iterations.i32(i32 8001) br label %vector.body vector.body: @@ -281,7 +281,7 @@ %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ] %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ] %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ] + %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ] %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>* 
%lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>* @@ -316,7 +316,7 @@ ; define dso_local void @overflow_in_sub(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { entry: - call void @llvm.set.loop.iterations.i32(i32 1073741824) + %start = call i32 @llvm.start.loop.iterations.i32(i32 1073741824) br label %vector.body vector.body: @@ -324,7 +324,7 @@ %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ] %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ] %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ] + %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ] %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>* %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>* @@ -359,7 +359,7 @@ ; define dso_local void @IV_not_an_induction(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { entry: - call void @llvm.set.loop.iterations.i32(i32 8001) + %start = call i32 @llvm.start.loop.iterations.i32(i32 8001) br label %vector.body vector.body: @@ -367,7 +367,7 @@ %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ] %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ] %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ] + %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ] %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>* %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>* @@ -402,7 +402,7 @@ ; define dso_local void @IV_wrong_step(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { 
entry: - call void @llvm.set.loop.iterations.i32(i32 8001) + %start = call i32 @llvm.start.loop.iterations.i32(i32 8001) br label %vector.body vector.body: @@ -410,7 +410,7 @@ %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ] %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ] %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ] + %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ] %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>* %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>* @@ -448,7 +448,7 @@ ; define dso_local void @IV_step_not_constant(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { entry: - call void @llvm.set.loop.iterations.i32(i32 8001) + %start = call i32 @llvm.start.loop.iterations.i32(i32 8001) br label %vector.body vector.body: @@ -456,7 +456,7 @@ %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ] %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ] %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ] + %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ] %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>* %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>* @@ -502,7 +502,7 @@ %lsr.iv31 = phi i32* [ %C, %vector.ph.preheader ], [ %scevgep32, %for.cond.cleanup3 ] %lsr.iv = phi i32* [ %A, %vector.ph.preheader ], [ %scevgep, %for.cond.cleanup3 ] %j.025 = phi i32 [ %inc11, %for.cond.cleanup3 ], [ 0, %vector.ph.preheader ] - call void @llvm.set.loop.iterations.i32(i32 1025) + %start = call i32 @llvm.start.loop.iterations.i32(i32 1025) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -510,7 +510,7 @@ %lsr.iv33 = phi i32* [ %scevgep34, %vector.body 
], [ %lsr.iv31, %vector.ph ] %lsr.iv28 = phi i32* [ %scevgep29, %vector.body ], [ %lsr.iv, %vector.ph ] %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %0 = phi i32 [ 1025, %vector.ph ], [ %2, %vector.body ] + %0 = phi i32 [ %start, %vector.ph ], [ %2, %vector.body ] %lsr.iv3840 = bitcast i32* %lsr.iv38 to <4 x i32>* %lsr.iv3335 = bitcast i32* %lsr.iv33 to <4 x i32>* %lsr.iv2830 = bitcast i32* %lsr.iv28 to <4 x i32>* @@ -546,5 +546,5 @@ declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #1 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2 declare i32 @llvm.loop.decrement.reg.i32(i32 , i32 ) -declare void @llvm.set.loop.iterations.i32(i32) +declare i32 @llvm.start.loop.iterations.i32(i32) declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll @@ -83,7 +83,7 @@ vector.ph: ; preds = %entry %trip.count.minus.1 = add i32 %N, -1 - call void @llvm.set.loop.iterations.i32(i32 %5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -91,7 +91,7 @@ %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ] %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ] %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %6 = phi i32 [ %5, %vector.ph ], [ %8, %vector.body ] + %6 = phi i32 [ %start, %vector.ph ], [ %8, %vector.body ] %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>* %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* @@ -118,6 +118,6 @@ declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 
x i32>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) -declare void @llvm.set.loop.iterations.i32(i32) +declare i32 @llvm.start.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32(i32, i32) declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll @@ -246,11 +246,11 @@ ; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 +; CHECK-NEXT: add.w r3, r3, r12, lsr #2 +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: adr r3, .LCPI5_0 ; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB5_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vadd.i32 q2, q0, r12 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-narrow.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-narrow.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-narrow.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-narrow.ll @@ -18,12 +18,12 @@ %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer - call void @llvm.set.loop.iterations.i32(i32 %tmp13) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ] + %tmp14 = phi i32 [ %start, 
%vector.ph ], [ %tmp15, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, @@ -50,5 +50,5 @@ declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) declare void @llvm.masked.store.v4i16.p0v4i16(<4 x i16>, <4 x i16>*, i32 immarg, <4 x i1>) -declare void @llvm.set.loop.iterations.i32(i32) +declare i32 @llvm.start.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-pattern-fail.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-pattern-fail.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-pattern-fail.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-pattern-fail.ll @@ -20,12 +20,12 @@ %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 1 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer - call void @llvm.set.loop.iterations.i32(i32 %tmp13) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ] + %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, @@ -65,12 +65,12 @@ %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert10 = insertelement <4 x i32> , i32 %trip.count.minus.1, i32 0 %broadcast.splat11 = shufflevector <4 x i32> 
%broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer - call void @llvm.set.loop.iterations.i32(i32 %tmp13) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ] + %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, @@ -110,12 +110,12 @@ %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> , <4 x i32> zeroinitializer - call void @llvm.set.loop.iterations.i32(i32 %tmp13) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ] + %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, @@ -155,12 +155,12 @@ %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> - call void @llvm.set.loop.iterations.i32(i32 %tmp13) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph 
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ] + %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, @@ -200,12 +200,12 @@ %trip.count.minus.2 = add i32 %N, -2 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.2, i32 1 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer - call void @llvm.set.loop.iterations.i32(i32 %tmp13) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ] + %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, @@ -245,12 +245,12 @@ %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer - call void @llvm.set.loop.iterations.i32(i32 %tmp13) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ] + %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] %broadcast.splatinsert = 
insertelement <4 x i32> undef, i32 %index, i32 1 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, @@ -289,12 +289,12 @@ %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer - call void @llvm.set.loop.iterations.i32(i32 %tmp13) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ] + %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] %incorrect = add i32 %index, 1 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %incorrect, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer @@ -335,12 +335,12 @@ %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer - call void @llvm.set.loop.iterations.i32(i32 %tmp13) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ] + %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, @@ -380,12 +380,12 @@ %trip.count.minus.1 = 
add i32 %N, -1 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer - call void @llvm.set.loop.iterations.i32(i32 %tmp13) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ] + %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, @@ -425,12 +425,12 @@ %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer - call void @llvm.set.loop.iterations.i32(i32 %tmp13) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ] + %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, %offsets @@ -470,12 +470,12 @@ %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer - call 
void @llvm.set.loop.iterations.i32(i32 %tmp13) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ] + %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, @@ -501,6 +501,6 @@ declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #1 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2 -declare void @llvm.set.loop.iterations.i32(i32) #3 +declare i32 @llvm.start.loop.iterations.i32(i32) #3 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll @@ -23,13 +23,13 @@ %0 = add i32 %n.vec, -8 %1 = lshr i32 %0, 3 %2 = add i32 %1, 1 - call void @llvm.set.loop.iterations.i32(i32 %2) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %2) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph], [ %index.next, %vector.body ] %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph], [ %tmp8, %vector.body ] - %3 = phi i32 [ %2, %vector.ph], [ %4, %vector.body ] + %3 = phi i32 [ %start, %vector.ph], [ %4, %vector.body ] %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer %induction = add <8 x i32> 
%broadcast.splat, @@ -94,13 +94,13 @@ %0 = add i32 %n.vec, -8 %1 = lshr i32 %0, 3 %2 = add nuw nsw i32 %1, 1 - call void @llvm.set.loop.iterations.i32(i32 %2) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %2) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph], [ %index.next, %vector.body ] %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph], [ %tmp6, %vector.body ] - %3 = phi i32 [ %2, %vector.ph], [ %4, %vector.body ] + %3 = phi i32 [ %start, %vector.ph], [ %4, %vector.body ] %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer %induction = add <8 x i32> %broadcast.splat, @@ -158,13 +158,13 @@ %0 = add i32 %n.vec, -8 %1 = lshr i32 %0, 3 %2 = add nuw nsw i32 %1, 1 - call void @llvm.set.loop.iterations.i32(i32 %2) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %2) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %entry], [ %index.next, %vector.body ] %vec.phi = phi <8 x i16> [ zeroinitializer, %entry], [ %tmp6, %vector.body ] - %3 = phi i32 [ %2, %entry ], [ %4, %vector.body ] + %3 = phi i32 [ %start, %entry ], [ %4, %vector.body ] %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer %induction = add <8 x i32> %broadcast.splat, @@ -228,7 +228,7 @@ vector.ph: ; preds = %for.body %trip.count.minus.1 = add i32 %8, -1 - call void @llvm.set.loop.iterations.i32(i32 %7) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %7) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -236,7 +236,7 @@ %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %Input, %vector.ph ] %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi <4 x i32> [ zeroinitializer, 
%vector.ph ], [ %16, %vector.body ] - %9 = phi i32 [ %7, %vector.ph ], [ %17, %vector.body ] + %9 = phi i32 [ %start, %vector.ph ], [ %17, %vector.body ] %lsr.iv4850 = bitcast i16* %lsr.iv48 to <4 x i16>* %lsr.iv45 = bitcast i16* %lsr.iv to <4 x i16>* %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %8) @@ -278,7 +278,7 @@ } declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>) -declare void @llvm.set.loop.iterations.i32(i32) +declare i32 @llvm.start.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll @@ -17,12 +17,12 @@ %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer - call void @llvm.set.loop.iterations.i32(i32 %tmp13) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ] + %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer %induction = add <8 x i32> %broadcast.splat, @@ -76,13 +76,13 @@ %broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> 
undef, <8 x i32> zeroinitializer %broadcast.splatinsert10.store = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat11.store = shufflevector <4 x i32> %broadcast.splatinsert10.store, <4 x i32> undef, <4 x i32> zeroinitializer - call void @llvm.set.loop.iterations.i32(i32 %tmp13) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %store.idx = phi i32 [ 0, %vector.ph ], [ %store.idx.next, %vector.body ] - %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ] + %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer %induction = add <8 x i32> %broadcast.splat, @@ -139,12 +139,12 @@ %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer - call void @llvm.set.loop.iterations.i32(i32 %tmp13) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ] + %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %induction = add <4 x i32> %broadcast.splat, @@ -178,7 +178,7 @@ declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, 
<4 x i32>*, i32 immarg, <4 x i1>) declare void @llvm.masked.store.v4i64.p0v4i64(<4 x i64>, <4 x i64>*, i32 immarg, <4 x i1>) -declare void @llvm.set.loop.iterations.i32(i32) +declare i32 @llvm.start.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll @@ -7,14 +7,14 @@ ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: movs r2, #3 ; CHECK-NEXT: adr r3, .LCPI0_0 -; CHECK-NEXT: mov.w lr, #3 +; CHECK-NEXT: dls lr, r2 ; CHECK-NEXT: vldrw.u32 q2, [r3] ; CHECK-NEXT: vmov.i32 q0, #0x80000000 ; CHECK-NEXT: vmvn.i32 q1, #0x80000000 ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: vmov.i32 q3, #0xa -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vadd.i32 q4, q2, r2 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredicated-max.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredicated-max.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredicated-max.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredicated-max.mir @@ -14,11 +14,11 @@ br i1 %cmp9, label %exit, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %tmp5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph - %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ] + %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ] %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ] 
%lsr.iv.2 = phi i16* [ %scevgep.2, %vector.body ], [ %c, %vector.ph ] %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ] @@ -40,7 +40,7 @@ } declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>) - declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.start.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <8 x i1> @llvm.arm.mve.vctp16(i32) declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>) @@ -132,7 +132,7 @@ renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg $r12 = t2MOVi16 32768, 14 /* CC::al */, $noreg $r12 = t2MOVTi16 killed $r12, 65535, 14 /* CC::al */, $noreg - t2DoLoopStart renamable $r3 + $lr = t2DoLoopStart renamable $r3 $r5 = tMOVr killed $r3, 14 /* CC::al */, $noreg bb.2.vector.body: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll @@ -57,9 +57,9 @@ ; CHECK-NEXT: subs r3, r2, r3 ; CHECK-NEXT: add.w r12, r3, #3 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 +; CHECK-NEXT: add.w r3, r3, r12, lsr #2 ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: .LBB1_1: @ %do.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unrolled-and-vector.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unrolled-and-vector.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unrolled-and-vector.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unrolled-and-vector.mir @@ -37,18 +37,18 @@ br i1 %7, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new for.body.preheader.new: ; preds = %for.body.preheader - call void @llvm.set.loop.iterations.i32(i32 
%11) + %start1 = call i32 @llvm.start.loop.iterations.i32(i32 %11) br label %for.body vector.ph: ; preds = %vector.memcheck - call void @llvm.set.loop.iterations.i32(i32 %5) + %start2 = call i32 @llvm.start.loop.iterations.i32(i32 %5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %lsr.iv50 = phi i8* [ %scevgep51, %vector.body ], [ %res, %vector.ph ] %lsr.iv47 = phi i8* [ %scevgep48, %vector.body ], [ %b, %vector.ph ] %lsr.iv = phi i8* [ %scevgep45, %vector.body ], [ %a, %vector.ph ] - %12 = phi i32 [ %5, %vector.ph ], [ %17, %vector.body ] + %12 = phi i32 [ %start2, %vector.ph ], [ %17, %vector.body ] %13 = phi i32 [ %N, %vector.ph ], [ %15, %vector.body ] %lsr.iv5052 = bitcast i8* %lsr.iv50 to <16 x i8>* %lsr.iv4749 = bitcast i8* %lsr.iv47 to <16 x i8>* @@ -88,7 +88,7 @@ for.body: ; preds = %for.body, %for.body.preheader.new %i.011 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ] - %21 = phi i32 [ %11, %for.body.preheader.new ], [ %30, %for.body ] + %21 = phi i32 [ %start1, %for.body.preheader.new ], [ %30, %for.body ] %scevgep23 = getelementptr i8, i8* %a, i32 %i.011 %scevgep2453 = bitcast i8* %scevgep23 to i8* %22 = load i8, i8* %scevgep2453, align 1 @@ -159,7 +159,7 @@ declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) #1 declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>) #2 - declare void @llvm.set.loop.iterations.i32(i32) #3 + declare i32 @llvm.start.loop.iterations.i32(i32) #3 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 declare <16 x i1> @llvm.arm.mve.vctp8(i32) #4 @@ -429,7 +429,7 @@ renamable $r6 = t2BICri killed renamable $r6, 15, 14, $noreg, $noreg renamable $r6, dead $cpsr = tSUBi8 killed renamable $r6, 16, 14, $noreg renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r6, 35, 14, $noreg, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.5.vector.body: successors: 
%bb.5(0x7c000000), %bb.11(0x04000000) @@ -455,7 +455,7 @@ renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r3, 19, 14, $noreg, $noreg renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.7.for.body: successors: %bb.7(0x7c000000), %bb.8(0x04000000) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-cpsr-loop-def.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-cpsr-loop-def.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-cpsr-loop-def.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-cpsr-loop-def.mir @@ -7,14 +7,14 @@ entry: %scevgep = getelementptr i32, i32* %q, i32 -1 %scevgep3 = getelementptr i32, i32* %p, i32 -1 - call void @llvm.set.loop.iterations.i32(i32 %n) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %n) %limit = lshr i32 %n, 1 br label %while.body while.body: ; preds = %while.body, %entry %lsr.iv4 = phi i32* [ %scevgep5, %while.body ], [ %scevgep3, %entry ] %lsr.iv = phi i32* [ %scevgep1, %while.body ], [ %scevgep, %entry ] - %tmp = phi i32 [ %n, %entry ], [ %tmp2, %while.body ] + %tmp = phi i32 [ %start, %entry ], [ %tmp2, %while.body ] %scevgep7 = getelementptr i32, i32* %lsr.iv, i32 1 %scevgep4 = getelementptr i32, i32* %lsr.iv4, i32 1 %tmp1 = load i32, i32* %scevgep7, align 4 @@ -33,7 +33,7 @@ } ; Function Attrs: noduplicate nounwind - declare void @llvm.set.loop.iterations.i32(i32) #0 + declare i32 @llvm.start.loop.iterations.i32(i32) #0 ; Function Attrs: noduplicate nounwind declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #0 @@ -130,7 +130,7 @@ renamable $r0, dead $cpsr = tSUBi3 killed renamable $r1, 4, 14, $noreg renamable $r1, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14, $noreg renamable $r2 = t2LSRri renamable $lr, 1, 14, $noreg, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.1.while.body: 
successors: %bb.1(0x7c000000), %bb.2(0x04000000) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-cpsr-loop-use.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-cpsr-loop-use.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-cpsr-loop-use.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-cpsr-loop-use.mir @@ -7,14 +7,14 @@ entry: %scevgep = getelementptr i32, i32* %q, i32 -1 %scevgep3 = getelementptr i32, i32* %p, i32 -1 - call void @llvm.set.loop.iterations.i32(i32 %n) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %n) %limit = lshr i32 %n, 1 br label %while.body while.body: ; preds = %while.body, %entry %lsr.iv4 = phi i32* [ %scevgep5, %while.body ], [ %scevgep3, %entry ] %lsr.iv = phi i32* [ %scevgep1, %while.body ], [ %scevgep, %entry ] - %tmp = phi i32 [ %n, %entry ], [ %tmp2, %while.body ] + %tmp = phi i32 [ %start, %entry ], [ %tmp2, %while.body ] %scevgep7 = getelementptr i32, i32* %lsr.iv, i32 1 %scevgep4 = getelementptr i32, i32* %lsr.iv4, i32 1 %tmp1 = load i32, i32* %scevgep7, align 4 @@ -33,7 +33,7 @@ } ; Function Attrs: noduplicate nounwind - declare void @llvm.set.loop.iterations.i32(i32) #0 + declare i32 @llvm.start.loop.iterations.i32(i32) #0 ; Function Attrs: noduplicate nounwind declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #0 @@ -129,7 +129,7 @@ frame-setup CFI_INSTRUCTION offset $r7, -8 renamable $r0, dead $cpsr = tSUBi3 killed renamable $r1, 4, 14, $noreg renamable $r1, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14, $noreg - t2DoLoopStart renamable $r0 + $lr = t2DoLoopStart renamable $r0 renamable $r2 = t2LSRri renamable $r0, 1, 14, $noreg, $noreg $lr = tMOVr $r0, 14, $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-liveout.mir deleted file mode 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-liveout.mir +++ /dev/null @@ -1,122 +0,0 @@ -# RUN: llc -mtriple=thumbv8.1m.main 
-run-pass=arm-low-overhead-loops %s -verify-machineinstrs -o - | FileCheck %s -# CHECK-NOT: $lr = t2DLS -# CHECK: $lr = tMOVr $r0, 14 -# CHECK-NOT: $lr = t2LEUpdate - ---- | - target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" - target triple = "thumbv8.1m.main" - - define i32 @do_copy(i32 %n, i32* nocapture %p, i32* nocapture readonly %q) { - entry: - %scevgep = getelementptr i32, i32* %q, i32 -1 - %scevgep3 = getelementptr i32, i32* %p, i32 -1 - call void @llvm.set.loop.iterations.i32(i32 %n) - br label %preheader - - preheader: - br label %while.body - - while.body: ; preds = %while.body, %entry - %lsr.iv4 = phi i32* [ %scevgep5, %while.body ], [ %scevgep3, %preheader ] - %lsr.iv = phi i32* [ %scevgep1, %while.body ], [ %scevgep, %preheader ] - %0 = phi i32 [ %n, %preheader ], [ %2, %while.body ] - %scevgep6 = getelementptr i32, i32* %lsr.iv, i32 1 - %scevgep2 = getelementptr i32, i32* %lsr.iv4, i32 1 - %1 = load i32, i32* %scevgep6, align 4 - store i32 %1, i32* %scevgep2, align 4 - %scevgep1 = getelementptr i32, i32* %lsr.iv, i32 1 - %scevgep5 = getelementptr i32, i32* %lsr.iv4, i32 1 - %2 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %0, i32 1) - %3 = icmp ne i32 %2, 0 - br i1 %3, label %while.body, label %while.end - - while.end: ; preds = %while.body - ret i32 0 - } - - declare void @llvm.set.loop.iterations.i32(i32) #0 - declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #0 - - attributes #0 = { noduplicate nounwind } - attributes #1 = { nounwind } - -... 
---- -name: do_copy -alignment: 2 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -failedISel: false -tracksRegLiveness: true -hasWinCFI: false -registers: [] -liveins: - - { reg: '$r0', virtual-reg: '' } - - { reg: '$r1', virtual-reg: '' } - - { reg: '$r2', virtual-reg: '' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 8 - offsetAdjustment: 0 - maxAlignment: 4 - adjustsStack: false - hasCalls: false - stackProtector: '' - maxCallFrameSize: 0 - cvBytesOfCalleeSavedRegisters: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false - localFrameSize: 0 - savePoint: '' - restorePoint: '' -fixedStack: [] -stack: - - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } -callSites: [] -constants: [] -machineFunctionInfo: {} -body: | - bb.0.entry: - successors: %bb.1(0x80000000) - liveins: $r0, $r1, $r2, $r7, $lr - - frame-setup tPUSH 14, $noreg, killed $r7, implicit-def $sp, implicit $sp - frame-setup CFI_INSTRUCTION def_cfa_offset 8 - frame-setup CFI_INSTRUCTION offset $lr, -4 - frame-setup CFI_INSTRUCTION offset $r7, -8 - t2DoLoopStart $r0 - renamable $r0, dead $cpsr = tSUBi3 killed renamable $r1, 4, 14, $noreg - renamable $r1, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14, $noreg - - bb.1.preheader: - successors: %bb.2(0x80000000) - liveins: $r0, $r1, $lr - $lr = tMOVr $r0, 14, $noreg - - bb.2.while.body: - successors: %bb.2(0x7c000000), %bb.3(0x04000000) - liveins: $lr, $r0, $r1 - - renamable $r2, renamable 
$r1 = t2LDR_PRE killed renamable $r1, 4, 14, $noreg :: (load 4 from %ir.scevgep6) - early-clobber renamable $r0 = t2STR_PRE killed renamable $r2, killed renamable $r0, 4, 14, $noreg :: (store 4 into %ir.scevgep2) - renamable $lr = t2LoopDec killed renamable $lr, 1 - t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr - tB %bb.3, 14, $noreg - - bb.3.while.end: - $r0, dead $cpsr = tMOVi8 0, 14, $noreg - tPOP_RET 14, $noreg, def $r7, def $pc, implicit killed $r0 - -... diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-retaining.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-retaining.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-retaining.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-retaining.mir @@ -8,11 +8,11 @@ br i1 %cmp, label %exit, label %loop.ph loop.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %iters) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %iters) br label %loop.body loop.body: ; preds = %loop.body, %loop.ph - %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %iters, %loop.ph ] + %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %start, %loop.ph ] %count = phi i32 [ %elts, %loop.ph ], [ %elts.rem, %loop.body ] %addr.a = phi <4 x i32>* [ %a, %loop.ph ], [ %addr.a.next, %loop.body ] %addr.b = phi <4 x i32>* [ %b, %loop.ph ], [ %addr.b.next, %loop.body ] @@ -44,11 +44,11 @@ br i1 %cmp, label %exit, label %loop.ph loop.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %iters) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %iters) br label %loop.body loop.body: ; preds = %loop.body, %loop.ph - %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %iters, %loop.ph ] + %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %start, %loop.ph ] %count = phi i32 [ %elts, %loop.ph ], [ %elts.rem, %loop.body ] %addr.a = phi <4 x i32>* [ %a, %loop.ph ], [ %addr.a.next, %loop.body ] %addr.b = phi <4 x i32>* [ %b, %loop.ph ], [ %addr.b.next, %loop.body ] @@ -75,7 +75,7 
@@ ret void } - declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.start.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <4 x i1> @llvm.arm.mve.vctp32(i32) declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) @@ -163,7 +163,7 @@ liveins: $r0, $r1, $r2, $r3, $r4, $lr renamable $r4 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) - t2DoLoopStart renamable $r4 + $lr = t2DoLoopStart renamable $r4 $r12 = tMOVr killed $r4, 14 /* CC::al */, $noreg bb.2.loop.body: @@ -269,7 +269,7 @@ liveins: $r0, $r1, $r2, $r3, $r4, $lr renamable $r4 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) - t2DoLoopStart renamable $r4 + $lr = t2DoLoopStart renamable $r4 $r12 = tMOVr killed $r4, 14 /* CC::al */, $noreg bb.2.loop.body: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-use-after.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-use-after.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-use-after.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-use-after.mir @@ -9,7 +9,7 @@ entry: %scevgep = getelementptr i32, i32* %q, i32 -1 %scevgep3 = getelementptr i32, i32* %p, i32 -1 - call void @llvm.set.loop.iterations.i32(i32 %n) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %n) br label %preheader preheader: @@ -18,7 +18,7 @@ while.body: ; preds = %while.body, %entry %lsr.iv4 = phi i32* [ %scevgep5, %while.body ], [ %scevgep3, %preheader ] %lsr.iv = phi i32* [ %scevgep1, %while.body ], [ %scevgep, %preheader ] - %0 = phi i32 [ %n, %preheader ], [ %2, %while.body ] + %0 = phi i32 [ %start, %preheader ], [ %2, %while.body ] %scevgep6 = getelementptr i32, i32* %lsr.iv, i32 1 %scevgep2 = getelementptr i32, i32* %lsr.iv4, i32 1 %1 = load i32, i32* %scevgep6, align 4 @@ -33,7 +33,7 @@ ret i32 0 } - declare void @llvm.set.loop.iterations.i32(i32) #0 + declare i32 
@llvm.start.loop.iterations.i32(i32) #0 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #0 attributes #0 = { noduplicate nounwind } @@ -89,11 +89,12 @@ ; CHECK-LABEL: name: do_copy ; CHECK: bb.0.entry: ; CHECK: successors: %bb.1(0x80000000) - ; CHECK: liveins: $lr, $r2, $r7 + ; CHECK: liveins: $r0, $r2, $r7 ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, implicit-def $sp, implicit $sp ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: $lr = t2DLS killed $r0 ; CHECK: renamable $r0 = t2SUBri killed renamable $lr, 4, 14 /* CC::al */, $noreg, def dead $cpsr ; CHECK: renamable $r1, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14 /* CC::al */, $noreg ; CHECK: bb.1.preheader: @@ -105,9 +106,7 @@ ; CHECK: liveins: $lr, $r0, $r1 ; CHECK: renamable $r2, renamable $r1 = t2LDR_PRE killed renamable $r1, 4, 14 /* CC::al */, $noreg :: (load 4 from %ir.scevgep6) ; CHECK: early-clobber renamable $r0 = t2STR_PRE killed renamable $r2, killed renamable $r0, 4, 14 /* CC::al */, $noreg :: (store 4 into %ir.scevgep2) - ; CHECK: $lr = t2SUBri killed renamable $lr, 1, 14 /* CC::al */, $noreg, def $cpsr - ; CHECK: tBcc %bb.2, 1 /* CC::ne */, killed $cpsr - ; CHECK: tB %bb.3, 14 /* CC::al */, $noreg + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 ; CHECK: bb.3.while.end: ; CHECK: $r0, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $r0 @@ -119,7 +118,7 @@ frame-setup CFI_INSTRUCTION def_cfa_offset 8 frame-setup CFI_INSTRUCTION offset $lr, -4 frame-setup CFI_INSTRUCTION offset $r7, -8 - t2DoLoopStart $r0 + $lr = t2DoLoopStart $r0 renamable $r0 = t2SUBri killed renamable $lr, 4, 14, $noreg, def $cpsr renamable $r1, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14, $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vaddv.mir 
b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vaddv.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vaddv.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vaddv.mir @@ -13,11 +13,11 @@ br i1 %cmp9, label %exit, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %tmp5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph - %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ] + %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ] %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ] %store.addr = phi i32* [ %c, %vector.ph ], [ %store.next, %vector.body ] %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ] @@ -51,11 +51,11 @@ br i1 %cmp9, label %exit, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %tmp5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph - %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ] + %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ] %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ] %store.addr = phi i32* [ %c, %vector.ph ], [ %store.next, %vector.body ] %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ] @@ -89,11 +89,11 @@ br i1 %cmp9, label %exit, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %tmp5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph - %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ] + %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ] %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %vector.ph ] %store.addr = phi i32* [ %c, %vector.ph ], [ %store.next, %vector.body ] %tmp7 = 
phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ] @@ -127,11 +127,11 @@ br i1 %cmp9, label %exit, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %tmp5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph - %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ] + %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ] %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ] %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ] %acc = phi i32 [ 0, %vector.ph ], [ %acc.next, %vector.body ] @@ -165,11 +165,11 @@ br i1 %cmp9, label %exit, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %tmp5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph - %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ] + %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ] %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ] %store.addr = phi i32* [ %c, %vector.ph ], [ %store.next, %vector.body ] %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ] @@ -204,11 +204,11 @@ br i1 %cmp9, label %exit, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %tmp5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph - %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ] + %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ] %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ] %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ] %acc = phi i32 [ 0, %vector.ph ], [ %acc.next, %vector.body ] @@ -243,11 +243,11 @@ br i1 %cmp9, label %exit, label %vector.ph vector.ph: ; 
preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %tmp5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph - %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ] + %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ] %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ] %store.addr = phi i32* [ %c, %vector.ph ], [ %store.next, %vector.body ] %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ] @@ -282,11 +282,11 @@ br i1 %cmp9, label %exit, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %tmp5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph - %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ] + %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ] %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ] %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ] %acc = phi i32 [ 0, %vector.ph ], [ %acc.next, %vector.body ] @@ -321,11 +321,11 @@ br i1 %cmp9, label %exit, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %tmp5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph - %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ] + %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ] %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %vector.ph ] %store.addr = phi i32* [ %c, %vector.ph ], [ %store.next, %vector.body ] %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ] @@ -361,11 +361,11 @@ br i1 %cmp9, label %exit, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %tmp5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 
%tmp5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph - %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ] + %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ] %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %vector.ph ] %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ] %acc = phi i32 [ 0, %vector.ph ], [ %acc.next, %vector.body ] @@ -401,11 +401,11 @@ br i1 %cmp9, label %exit, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %tmp5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph - %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ] + %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ] %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ] %store.addr = phi i32* [ %c, %vector.ph ], [ %store.next, %vector.body ] %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ] @@ -440,11 +440,11 @@ br i1 %cmp9, label %exit, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %tmp5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph - %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ] + %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ] %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ] %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ] %acc = phi i32 [ 0, %vector.ph ], [ %acc.next, %vector.body ] @@ -479,11 +479,11 @@ br i1 %cmp9, label %exit, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %tmp5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph - %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ 
%tmp5, %vector.ph ] + %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ] %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %vector.ph ] %store.addr = phi i32* [ %c, %vector.ph ], [ %store.next, %vector.body ] %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ] @@ -518,11 +518,11 @@ br i1 %cmp9, label %exit, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %tmp5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph - %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ] + %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ] %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %vector.ph ] %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ] %acc = phi i32 [ 0, %vector.ph ], [ %acc.next, %vector.body ] @@ -557,11 +557,11 @@ br i1 %cmp9, label %exit, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %tmp5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph - %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ] + %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ] %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %vector.ph ] %store.addr = phi i32* [ %c, %vector.ph ], [ %store.next, %vector.body ] %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ] @@ -596,11 +596,11 @@ br i1 %cmp9, label %exit, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %tmp5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph - %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ] + %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ] %lsr.iv = phi i8* [ %scevgep, 
%vector.body ], [ %a, %vector.ph ] %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ] %acc = phi i32 [ 0, %vector.ph ], [ %acc.next, %vector.body ] @@ -635,7 +635,7 @@ br i1 %cmp22, label %while.body.preheader, label %while.end while.body.preheader: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %4) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %4) br label %while.body while.body: ; preds = %while.body.preheader, %while.body @@ -643,7 +643,7 @@ %y.addr.025 = phi i16* [ %add.ptr4, %while.body ], [ %y, %while.body.preheader ] %n.addr.023 = phi i32 [ %sub, %while.body ], [ %n, %while.body.preheader ] %acc = phi i32 [ %acc.next, %while.body ], [ 0, %while.body.preheader ] - %5 = phi i32 [ %4, %while.body.preheader ], [ %6, %while.body ] + %5 = phi i32 [ %start, %while.body.preheader ], [ %6, %while.body ] %tmp3 = bitcast i16* %y.addr.025 to <4 x i16>* %tmp1 = bitcast i16* %x.addr.026 to <4 x i16>* %tmp = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %n.addr.023) @@ -678,7 +678,7 @@ br i1 %cmp22, label %while.body.preheader, label %while.end while.body.preheader: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %4) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %4) br label %while.body while.body: ; preds = %while.body.preheader, %while.body @@ -686,7 +686,7 @@ %y.addr.025 = phi i16* [ %add.ptr4, %while.body ], [ %y, %while.body.preheader ] %n.addr.023 = phi i32 [ %sub, %while.body ], [ %n, %while.body.preheader ] %acc = phi i32 [ %acc.next, %while.body ], [ 0, %while.body.preheader ] - %5 = phi i32 [ %4, %while.body.preheader ], [ %6, %while.body ] + %5 = phi i32 [ %start, %while.body.preheader ], [ %6, %while.body ] %tmp3 = bitcast i16* %y.addr.025 to <8 x i16>* %tmp1 = bitcast i16* %x.addr.026 to <8 x i16>* %tmp = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %n.addr.023) @@ -720,7 +720,7 @@ br i1 %cmp22, label %while.body.preheader, label %while.end while.body.preheader: ; preds = %entry - call void 
@llvm.set.loop.iterations.i32(i32 %4) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %4) br label %while.body while.body: ; preds = %while.body.preheader, %while.body @@ -728,7 +728,7 @@ %y.addr.025 = phi i16* [ %add.ptr4, %while.body ], [ %y, %while.body.preheader ] %n.addr.023 = phi i32 [ %sub, %while.body ], [ %n, %while.body.preheader ] %acc = phi i32 [ %acc.next, %while.body ], [ 0, %while.body.preheader ] - %5 = phi i32 [ %4, %while.body.preheader ], [ %6, %while.body ] + %5 = phi i32 [ %start, %while.body.preheader ], [ %6, %while.body ] %tmp3 = bitcast i16* %y.addr.025 to <8 x i16>* %tmp1 = bitcast i16* %x.addr.026 to <8 x i16>* %tmp = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %n.addr.023) @@ -763,7 +763,7 @@ br i1 %cmp22, label %while.body.preheader, label %while.end while.body.preheader: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %4) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %4) br label %while.body while.body: ; preds = %while.body.preheader, %while.body @@ -771,7 +771,7 @@ %y.addr.025 = phi i16* [ %add.ptr4, %while.body ], [ %y, %while.body.preheader ] %n.addr.023 = phi i32 [ %sub, %while.body ], [ %n, %while.body.preheader ] %acc = phi i32 [ %acc.next, %while.body ], [ 0, %while.body.preheader ] - %5 = phi i32 [ %4, %while.body.preheader ], [ %6, %while.body ] + %5 = phi i32 [ %start, %while.body.preheader ], [ %6, %while.body ] %tmp3 = bitcast i16* %y.addr.025 to <8 x i16>* %tmp1 = bitcast i16* %x.addr.026 to <8 x i16>* %tmp = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %n.addr.023) @@ -803,7 +803,7 @@ declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) - declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.start.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16>, <8 x i16>, i32, i32) declare <4 
x i1> @llvm.arm.mve.vctp32(i32) @@ -887,7 +887,7 @@ renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg - t2DoLoopStart renamable $r12 + $lr = t2DoLoopStart renamable $r12 $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg bb.2.vector.body: @@ -986,7 +986,7 @@ renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg - t2DoLoopStart renamable $r12 + $lr = t2DoLoopStart renamable $r12 $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg bb.2.vector.body: @@ -1085,7 +1085,7 @@ renamable $r12 = t2SUBri killed renamable $r3, 7, 14 /* CC::al */, $noreg, $noreg renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 27, 14 /* CC::al */, $noreg, $noreg - t2DoLoopStart renamable $r12 + $lr = t2DoLoopStart renamable $r12 $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg bb.2.vector.body: @@ -1185,7 +1185,7 @@ renamable $r2 = t2BICri killed renamable $r2, 3, 14 /* CC::al */, $noreg, $noreg renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg renamable $r2 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r2, 19, 14 /* CC::al */, $noreg, $noreg - t2DoLoopStart renamable $r2 + $lr = t2DoLoopStart renamable $r2 $r3 = tMOVr killed $r2, 14 /* CC::al */, $noreg renamable $r2, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg @@ -1304,7 +1304,7 @@ renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 
/* CC::al */, $noreg, $noreg - t2DoLoopStart renamable $r12 + $lr = t2DoLoopStart renamable $r12 $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg bb.2.vector.body: @@ -1417,7 +1417,7 @@ renamable $r2 = t2BICri killed renamable $r2, 3, 14 /* CC::al */, $noreg, $noreg renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg renamable $r2 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r2, 19, 14 /* CC::al */, $noreg, $noreg - t2DoLoopStart renamable $r2 + $lr = t2DoLoopStart renamable $r2 $r3 = tMOVr killed $r2, 14 /* CC::al */, $noreg renamable $r2, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg @@ -1537,7 +1537,7 @@ renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg - t2DoLoopStart renamable $r12 + $lr = t2DoLoopStart renamable $r12 $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg bb.2.vector.body: @@ -1650,7 +1650,7 @@ renamable $r2 = t2BICri killed renamable $r2, 3, 14 /* CC::al */, $noreg, $noreg renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg renamable $r2 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r2, 19, 14 /* CC::al */, $noreg, $noreg - t2DoLoopStart renamable $r2 + $lr = t2DoLoopStart renamable $r2 $r3 = tMOVr killed $r2, 14 /* CC::al */, $noreg renamable $r2, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg @@ -1779,7 +1779,7 @@ renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg renamable $r3 = tADDrSPi $sp, 2, 14 /* CC::al */, $noreg renamable $q0 = MVE_VLDRWU32 killed renamable $r3, 0, 0, $noreg :: (load 16 from %fixed-stack.0, align 8) - t2DoLoopStart renamable $r12 + $lr = t2DoLoopStart renamable $r12 $r4 = tMOVr killed $r12, 14 /* CC::al */, $noreg bb.2.vector.body: @@ -1904,7 +1904,7 @@ renamable $d1 
= VLDRD $sp, 2, 14 /* CC::al */, $noreg, implicit killed $q0, implicit-def $q0 :: (load 8 from %fixed-stack.0) renamable $r2 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r2, 19, 14 /* CC::al */, $noreg, $noreg renamable $r3, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg - t2DoLoopStart renamable $r2 + $lr = t2DoLoopStart renamable $r2 $r4 = tMOVr killed $r2, 14 /* CC::al */, $noreg bb.2.vector.body: @@ -2032,7 +2032,7 @@ renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg renamable $r3 = tADDrSPi $sp, 2, 14 /* CC::al */, $noreg renamable $q0 = MVE_VLDRWU32 killed renamable $r3, 0, 0, $noreg :: (load 16 from %fixed-stack.0, align 8) - t2DoLoopStart renamable $r12 + $lr = t2DoLoopStart renamable $r12 $r4 = tMOVr killed $r12, 14 /* CC::al */, $noreg bb.2.vector.body: @@ -2157,7 +2157,7 @@ renamable $d1 = VLDRD $sp, 2, 14 /* CC::al */, $noreg, implicit killed $q0, implicit-def $q0 :: (load 8 from %fixed-stack.0) renamable $r2 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r2, 19, 14 /* CC::al */, $noreg, $noreg renamable $r3, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg - t2DoLoopStart renamable $r2 + $lr = t2DoLoopStart renamable $r2 $r4 = tMOVr killed $r2, 14 /* CC::al */, $noreg bb.2.vector.body: @@ -2285,7 +2285,7 @@ renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 27, 14 /* CC::al */, $noreg, $noreg renamable $r3 = tADDrSPi $sp, 2, 14 /* CC::al */, $noreg renamable $q0 = MVE_VLDRWU32 killed renamable $r3, 0, 0, $noreg :: (load 16 from %fixed-stack.0, align 8) - t2DoLoopStart renamable $r12 + $lr = t2DoLoopStart renamable $r12 $r4 = tMOVr killed $r12, 14 /* CC::al */, $noreg bb.2.vector.body: @@ -2410,7 +2410,7 @@ renamable $d1 = VLDRD $sp, 2, 14 /* CC::al */, $noreg, implicit killed $q0, implicit-def $q0 :: (load 8 from %fixed-stack.0) renamable $r2 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r2, 27, 14 /* CC::al */, $noreg, $noreg 
renamable $r3, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg - t2DoLoopStart renamable $r2 + $lr = t2DoLoopStart renamable $r2 $r4 = tMOVr killed $r2, 14 /* CC::al */, $noreg bb.2.vector.body: @@ -2538,7 +2538,7 @@ renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 27, 14 /* CC::al */, $noreg, $noreg renamable $r3 = tADDrSPi $sp, 2, 14 /* CC::al */, $noreg renamable $q0 = MVE_VLDRWU32 killed renamable $r3, 0, 0, $noreg :: (load 16 from %fixed-stack.0, align 8) - t2DoLoopStart renamable $r12 + $lr = t2DoLoopStart renamable $r12 $r4 = tMOVr killed $r12, 14 /* CC::al */, $noreg bb.2.vector.body: @@ -2663,7 +2663,7 @@ renamable $d1 = VLDRD $sp, 2, 14 /* CC::al */, $noreg, implicit killed $q0, implicit-def $q0 :: (load 8 from %fixed-stack.0) renamable $r2 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r2, 27, 14 /* CC::al */, $noreg, $noreg renamable $r3, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg - t2DoLoopStart renamable $r2 + $lr = t2DoLoopStart renamable $r2 $r4 = tMOVr killed $r2, 14 /* CC::al */, $noreg bb.2.vector.body: @@ -2781,7 +2781,7 @@ renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg renamable $r12 = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.2.while.body: successors: %bb.2(0x7c000000), %bb.3(0x04000000) @@ -2897,7 +2897,7 @@ renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 27, 14 /* CC::al */, $noreg, $noreg renamable $r3, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.2.while.body: successors: %bb.2(0x7c000000), %bb.3(0x04000000) @@ -3026,7 +3026,7 @@ renamable $r2, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg renamable $lr = nuw nsw t2ADDrs killed renamable $r2, 
killed renamable $r12, 27, 14 /* CC::al */, $noreg, $noreg renamable $r2, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.2.while.body: successors: %bb.2(0x7c000000), %bb.3(0x04000000) @@ -3150,7 +3150,7 @@ renamable $r12 = t2ADDri killed renamable $r2, 7, 14 /* CC::al */, $noreg, $noreg renamable $r2, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg renamable $r2 = nuw nsw t2ADDrs killed renamable $r2, killed renamable $r12, 27, 14 /* CC::al */, $noreg, $noreg - t2DoLoopStart renamable $r2 + $lr = t2DoLoopStart renamable $r2 $r12 = tMOVr killed $r2, 14 /* CC::al */, $noreg renamable $r2, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll @@ -26,7 +26,7 @@ ; ENABLED-NEXT: ldr r0, [sp, #36] ; ENABLED-NEXT: add.w r12, r2, #3 ; ENABLED-NEXT: ldr.w r10, [sp] @ 4-byte Reload -; ENABLED-NEXT: movs r6, #0 +; ENABLED-NEXT: mov.w r8, #0 ; ENABLED-NEXT: mov r9, r12 ; ENABLED-NEXT: uxth r0, r0 ; ENABLED-NEXT: rsbs r5, r0, #0 @@ -37,32 +37,32 @@ ; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; ENABLED-NEXT: lsrs r0, r0, #16 ; ENABLED-NEXT: sub.w r9, r9, #1 -; ENABLED-NEXT: strh.w r0, [r1, r6, lsl #1] -; ENABLED-NEXT: adds r6, #1 +; ENABLED-NEXT: strh.w r0, [r1, r8, lsl #1] +; ENABLED-NEXT: add.w r8, r8, #1 ; ENABLED-NEXT: add.w r10, r10, #2 -; ENABLED-NEXT: cmp r6, r3 +; ENABLED-NEXT: cmp r8, r3 ; ENABLED-NEXT: beq .LBB0_8 ; ENABLED-NEXT: .LBB0_4: @ %for.body ; ENABLED-NEXT: @ =>This Loop Header: Depth=1 ; ENABLED-NEXT: @ Child Loop BB0_6 Depth 2 -; ENABLED-NEXT: cmp r2, r6 +; ENABLED-NEXT: cmp r2, r8 ; ENABLED-NEXT: ble .LBB0_2 ; ENABLED-NEXT: @ %bb.5: @ %vector.ph ; ENABLED-NEXT: @ in Loop: Header=BB0_4 
Depth=1 ; ENABLED-NEXT: bic r0, r9, #3 ; ENABLED-NEXT: movs r7, #1 ; ENABLED-NEXT: subs r0, #4 -; ENABLED-NEXT: subs r4, r2, r6 +; ENABLED-NEXT: sub.w r4, r2, r8 ; ENABLED-NEXT: vmov.i32 q1, #0x0 -; ENABLED-NEXT: add.w r8, r7, r0, lsr #2 -; ENABLED-NEXT: sub.w r0, r12, r6 +; ENABLED-NEXT: add.w r6, r7, r0, lsr #2 +; ENABLED-NEXT: sub.w r0, r12, r8 ; ENABLED-NEXT: bic r0, r0, #3 ; ENABLED-NEXT: subs r0, #4 ; ENABLED-NEXT: add.w r0, r7, r0, lsr #2 ; ENABLED-NEXT: mov r7, r10 ; ENABLED-NEXT: dls lr, r0 ; ENABLED-NEXT: ldr r0, [sp] @ 4-byte Reload -; ENABLED: .LBB0_6: @ %vector.body +; ENABLED-NEXT: .LBB0_6: @ %vector.body ; ENABLED-NEXT: @ Parent Loop BB0_4 Depth=1 ; ENABLED-NEXT: @ => This Inner Loop Header: Depth=2 ; ENABLED-NEXT: vctp.32 r4 @@ -70,9 +70,9 @@ ; ENABLED-NEXT: vpstt ; ENABLED-NEXT: vldrht.s32 q1, [r0], #8 ; ENABLED-NEXT: vldrht.s32 q2, [r7], #8 -; ENABLED-NEXT: mov lr, r8 +; ENABLED-NEXT: mov lr, r6 ; ENABLED-NEXT: vmul.i32 q1, q2, q1 -; ENABLED-NEXT: sub.w r8, r8, #1 +; ENABLED-NEXT: subs r6, #1 ; ENABLED-NEXT: vshl.s32 q1, r5 ; ENABLED-NEXT: subs r4, #4 ; ENABLED-NEXT: vadd.i32 q1, q1, q0 @@ -97,7 +97,7 @@ ; NOREDUCTIONS-NEXT: ldr r0, [sp, #36] ; NOREDUCTIONS-NEXT: add.w r12, r2, #3 ; NOREDUCTIONS-NEXT: ldr.w r10, [sp] @ 4-byte Reload -; NOREDUCTIONS-NEXT: movs r6, #0 +; NOREDUCTIONS-NEXT: mov.w r8, #0 ; NOREDUCTIONS-NEXT: mov r9, r12 ; NOREDUCTIONS-NEXT: uxth r0, r0 ; NOREDUCTIONS-NEXT: rsbs r5, r0, #0 @@ -108,31 +108,31 @@ ; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; NOREDUCTIONS-NEXT: lsrs r0, r0, #16 ; NOREDUCTIONS-NEXT: sub.w r9, r9, #1 -; NOREDUCTIONS-NEXT: strh.w r0, [r1, r6, lsl #1] -; NOREDUCTIONS-NEXT: adds r6, #1 +; NOREDUCTIONS-NEXT: strh.w r0, [r1, r8, lsl #1] +; NOREDUCTIONS-NEXT: add.w r8, r8, #1 ; NOREDUCTIONS-NEXT: add.w r10, r10, #2 -; NOREDUCTIONS-NEXT: cmp r6, r3 -; NOREDUCTIONS: beq .LBB0_8 +; NOREDUCTIONS-NEXT: cmp r8, r3 +; NOREDUCTIONS-NEXT: beq .LBB0_8 ; NOREDUCTIONS-NEXT: .LBB0_4: @ %for.body ; NOREDUCTIONS-NEXT: 
@ =>This Loop Header: Depth=1 ; NOREDUCTIONS-NEXT: @ Child Loop BB0_6 Depth 2 -; NOREDUCTIONS-NEXT: cmp r2, r6 +; NOREDUCTIONS-NEXT: cmp r2, r8 ; NOREDUCTIONS-NEXT: ble .LBB0_2 ; NOREDUCTIONS-NEXT: @ %bb.5: @ %vector.ph ; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; NOREDUCTIONS-NEXT: bic r0, r9, #3 ; NOREDUCTIONS-NEXT: movs r7, #1 ; NOREDUCTIONS-NEXT: subs r0, #4 -; NOREDUCTIONS-NEXT: subs r4, r2, r6 +; NOREDUCTIONS-NEXT: sub.w r4, r2, r8 ; NOREDUCTIONS-NEXT: vmov.i32 q1, #0x0 -; NOREDUCTIONS-NEXT: add.w r8, r7, r0, lsr #2 -; NOREDUCTIONS-NEXT: sub.w r0, r12, r6 +; NOREDUCTIONS-NEXT: add.w r6, r7, r0, lsr #2 +; NOREDUCTIONS-NEXT: sub.w r0, r12, r8 ; NOREDUCTIONS-NEXT: bic r0, r0, #3 ; NOREDUCTIONS-NEXT: subs r0, #4 ; NOREDUCTIONS-NEXT: add.w r0, r7, r0, lsr #2 ; NOREDUCTIONS-NEXT: mov r7, r10 ; NOREDUCTIONS-NEXT: dls lr, r0 -; NOREDUCTIONS: ldr r0, [sp] @ 4-byte Reload +; NOREDUCTIONS-NEXT: ldr r0, [sp] @ 4-byte Reload ; NOREDUCTIONS-NEXT: .LBB0_6: @ %vector.body ; NOREDUCTIONS-NEXT: @ Parent Loop BB0_4 Depth=1 ; NOREDUCTIONS-NEXT: @ => This Inner Loop Header: Depth=2 @@ -141,9 +141,9 @@ ; NOREDUCTIONS-NEXT: vpstt ; NOREDUCTIONS-NEXT: vldrht.s32 q1, [r0], #8 ; NOREDUCTIONS-NEXT: vldrht.s32 q2, [r7], #8 -; NOREDUCTIONS-NEXT: mov lr, r8 +; NOREDUCTIONS-NEXT: mov lr, r6 ; NOREDUCTIONS-NEXT: vmul.i32 q1, q2, q1 -; NOREDUCTIONS-NEXT: sub.w r8, r8, #1 +; NOREDUCTIONS-NEXT: subs r6, #1 ; NOREDUCTIONS-NEXT: vshl.s32 q1, r5 ; NOREDUCTIONS-NEXT: subs r4, #4 ; NOREDUCTIONS-NEXT: vadd.i32 q1, q1, q0 @@ -184,7 +184,7 @@ vector.ph: ; preds = %for.body %trip.count.minus.1 = add i32 %i8, -1 - call void @llvm.set.loop.iterations.i32(i32 %i7) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %i7) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -192,7 +192,7 @@ %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %Input, %vector.ph ] %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.phi = phi <4 x i32> [ zeroinitializer, 
%vector.ph ], [ %i16, %vector.body ] - %i9 = phi i32 [ %i7, %vector.ph ], [ %i17, %vector.body ] + %i9 = phi i32 [ %start, %vector.ph ], [ %i17, %vector.body ] %lsr.iv4850 = bitcast i16* %lsr.iv48 to <4 x i16>* %lsr.iv45 = bitcast i16* %lsr.iv to <4 x i16>* %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %i8) @@ -237,4 +237,4 @@ declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) declare i32 @llvm.loop.decrement.reg.i32(i32, i32) -declare void @llvm.set.loop.iterations.i32(i32) +declare i32 @llvm.start.loop.iterations.i32(i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll @@ -51,8 +51,8 @@ ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: vmov.i32 q1, #0x1 -; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: dlstp.32 lr, r0 +; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q2, [r1], #16 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir @@ -17,11 +17,11 @@ br i1 %cmp9, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph - %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %5, %vector.ph ] + %lsr.iv1 = phi i32 [ %lsr.iv.next, 
%vector.body ], [ %start, %vector.ph ] %lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %vector.ph ] %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ] %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %12, %vector.body ] @@ -56,7 +56,7 @@ } declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) - declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.start.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <4 x i1> @llvm.arm.mve.vctp32(i32) @@ -174,7 +174,7 @@ renamable $r12 = t2SUBri killed renamable $r3, 4, 14, $noreg, $noreg renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14, $noreg, $noreg - t2DoLoopStart renamable $r12 + $lr = t2DoLoopStart renamable $r12 $r3 = tMOVr killed $r12, 14, $noreg bb.2.vector.body: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir @@ -17,13 +17,13 @@ br i1 %tmp, label %bb27, label %bb3 bb3: ; preds = %bb - call void @llvm.set.loop.iterations.i32(i32 %tmp6) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp6) br label %bb9 bb9: ; preds = %bb9, %bb3 %lsr.iv2 = phi i32* [ %scevgep3, %bb9 ], [ %arg1, %bb3 ] %lsr.iv = phi i32* [ %scevgep, %bb9 ], [ %arg, %bb3 ] - %tmp7 = phi i32 [ %tmp6, %bb3 ], [ %tmp12, %bb9 ] + %tmp7 = phi i32 [ %start, %bb3 ], [ %tmp12, %bb9 ] %tmp8 = phi i32 [ %arg2, %bb3 ], [ %tmp11, %bb9 ] %lsr.iv24 = bitcast i32* %lsr.iv2 to <4 x i32>* %lsr.iv1 = bitcast i32* %lsr.iv to <4 x i32>* @@ -47,7 +47,7 @@ } declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) declare void 
@llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) - declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.start.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <4 x i1> @llvm.arm.mve.vctp32(i32) declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) @@ -163,7 +163,7 @@ VSTR_P0_off killed renamable $vpr, $sp, 0, 14, $noreg :: (store 4 into %stack.0) $r3 = tMOVr $r0, 14, $noreg renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.2.bb9: successors: %bb.2(0x7c000000), %bb.3(0x04000000) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt.mir @@ -16,13 +16,13 @@ br i1 %tmp, label %bb27, label %bb3 bb3: ; preds = %bb - call void @llvm.set.loop.iterations.i32(i32 %tmp6) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp6) br label %bb9 bb9: ; preds = %bb9, %bb3 %lsr.iv2 = phi i32* [ %scevgep3, %bb9 ], [ %arg1, %bb3 ] %lsr.iv = phi i32* [ %scevgep, %bb9 ], [ %arg, %bb3 ] - %tmp7 = phi i32 [ %tmp6, %bb3 ], [ %tmp12, %bb9 ] + %tmp7 = phi i32 [ %start, %bb3 ], [ %tmp12, %bb9 ] %tmp8 = phi i32 [ %arg2, %bb3 ], [ %tmp11, %bb9 ] %lsr.iv24 = bitcast i32* %lsr.iv2 to <4 x i32>* %lsr.iv1 = bitcast i32* %lsr.iv to <4 x i32>* @@ -78,7 +78,7 @@ declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #1 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2 - declare void @llvm.set.loop.iterations.i32(i32) #3 + declare i32 @llvm.start.loop.iterations.i32(i32) #3 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4 declare <4 x i1> 
@llvm.arm.mve.pred.i2v.v4i1(i32) #4 @@ -193,7 +193,7 @@ VSTR_P0_off killed renamable $vpr, $sp, 0, 14, $noreg :: (store 4 into %stack.0) $r3 = tMOVr $r0, 14, $noreg renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.2.bb9: successors: %bb.2(0x7c000000), %bb.3(0x04000000) @@ -335,7 +335,7 @@ VSTR_P0_off killed renamable $vpr, $sp, 0, 14, $noreg :: (store 4 into %stack.0) $r3 = tMOVr $r0, 14, $noreg renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.2.bb9: successors: %bb.2(0x7c000000), %bb.3(0x04000000) @@ -478,7 +478,7 @@ VSTR_P0_off killed renamable $vpr, $sp, 0, 14, $noreg :: (store 4 into %stack.0) $r3 = tMOVr $r0, 14, $noreg renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.2.bb9: successors: %bb.2(0x7c000000), %bb.3(0x04000000) @@ -621,7 +621,7 @@ VSTR_P0_off killed renamable $vpr, $sp, 0, 14, $noreg :: (store 4 into %stack.0) $r3 = tMOVr $r0, 14, $noreg renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.2.bb9: successors: %bb.2(0x7c000000), %bb.3(0x04000000) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subi3.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subi3.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subi3.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subi3.mir @@ -14,14 +14,14 @@ br i1 %cmp8, label %vector.ph, label %for.cond.cleanup vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %lsr.iv17 
= phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ] %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ] %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ] - %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ] + %6 = phi i32 [ %start, %vector.ph ], [ %11, %vector.body ] %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ] %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>* %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* @@ -42,7 +42,7 @@ for.cond.cleanup: ; preds = %vector.body, %entry ret void } - declare void @llvm.set.loop.iterations.i32(i32) #1 + declare i32 @llvm.start.loop.iterations.i32(i32) #1 declare <4 x i1> @llvm.arm.mve.vctp32(i32) #2 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1 declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #3 @@ -143,7 +143,7 @@ renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.2.vector.body: successors: %bb.2(0x7c000000), %bb.3(0x04000000) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subri.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subri.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subri.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subri.mir @@ -14,14 +14,14 @@ br i1 %cmp8, label %vector.ph, label %for.cond.cleanup vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ] %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ] %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ] - %6 = phi i32 [ %5, %vector.ph ], 
[ %11, %vector.body ] + %6 = phi i32 [ %start, %vector.ph ], [ %11, %vector.body ] %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ] %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>* %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* @@ -42,7 +42,7 @@ for.cond.cleanup: ; preds = %vector.body, %entry ret void } - declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.start.loop.iterations.i32(i32) declare <4 x i1> @llvm.arm.mve.vctp32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) @@ -142,7 +142,7 @@ renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.2.vector.body: successors: %bb.2(0x7c000000), %bb.3(0x04000000) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subri12.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subri12.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subri12.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subri12.mir @@ -14,14 +14,14 @@ br i1 %cmp8, label %vector.ph, label %for.cond.cleanup vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ] %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ] %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ] - %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ] + %6 = phi i32 [ %start, %vector.ph ], [ %11, %vector.body ] %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ] %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>* %lsr.iv1416 = bitcast 
i32* %lsr.iv14 to <4 x i32>* @@ -42,7 +42,7 @@ for.cond.cleanup: ; preds = %vector.body, %entry ret void } - declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.start.loop.iterations.i32(i32) declare <4 x i1> @llvm.arm.mve.vctp32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) @@ -142,7 +142,7 @@ renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.2.vector.body: successors: %bb.2(0x7c000000), %bb.3(0x04000000) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp16-reduce.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp16-reduce.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp16-reduce.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp16-reduce.mir @@ -14,7 +14,7 @@ br i1 %cmp11, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %5) %6 = shl i32 %4, 3 %7 = sub i32 %N, %6 br label %vector.body @@ -23,7 +23,7 @@ %lsr.iv20 = phi i8* [ %scevgep21, %vector.body ], [ %c, %vector.ph ] %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %b, %vector.ph ] %vec.phi = phi <8 x i16> [ , %vector.ph ], [ %15, %vector.body ] - %8 = phi i32 [ %5, %vector.ph ], [ %16, %vector.body ] + %8 = phi i32 [ %start, %vector.ph ], [ %16, %vector.body ] %9 = phi i32 [ %N, %vector.ph ], [ %11, %vector.body ] %lsr.iv2022 = bitcast i8* %lsr.iv20 to <8 x i8>* %lsr.iv19 = bitcast i8* %lsr.iv to <8 x i8>* @@ -55,7 +55,7 @@ } declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32 immarg, <8 x i1>, <8 x i8>) declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) - declare void 
@llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.start.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <8 x i1> @llvm.arm.mve.vctp16(i32) @@ -189,7 +189,7 @@ renamable $r12 = t2LSRri killed renamable $r12, 3, 14, $noreg, $noreg renamable $q0 = MVE_VLDRWU32 killed renamable $r3, 0, 0, $noreg :: (load 16 from constant-pool) renamable $r3 = t2SUBrs renamable $r2, killed renamable $r12, 26, 14, $noreg, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.2.vector.body: successors: %bb.2(0x7c000000), %bb.3(0x04000000) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll @@ -15,10 +15,10 @@ ; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 +; CHECK-NEXT: add.w r3, r3, r12, lsr #2 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: dls lr, lr -; CHECK: .LBB0_2: @ %vector.body +; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 ; CHECK-NEXT: vmov q1, q0 @@ -92,10 +92,10 @@ ; CHECK-NEXT: bic r1, r1, #3 ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: subs r1, #4 -; CHECK-NEXT: add.w lr, r3, r1, lsr #2 +; CHECK-NEXT: add.w r1, r3, r1, lsr #2 +; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: dls lr, lr -; CHECK: .LBB1_2: @ %vector.body +; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 ; CHECK-NEXT: vmov q1, q0 @@ -163,10 +163,10 @@ ; CHECK-NEXT: bic r1, r1, #3 ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: subs r1, #4 -; CHECK-NEXT: add.w lr, r3, r1, lsr #2 +; CHECK-NEXT: add.w r1, r3, r1, lsr #2 +; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: movs 
r1, #0 -; CHECK-NEXT: dls lr, lr -; CHECK: .LBB2_2: @ %vector.body +; CHECK-NEXT: .LBB2_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 ; CHECK-NEXT: vmov q1, q0 @@ -228,9 +228,9 @@ ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} ; CHECK-NEXT: .LBB3_1: @ %vector.ph -; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 -; CHECK: .LBB3_2: @ %vector.body +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 @@ -285,9 +285,9 @@ ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} ; CHECK-NEXT: .LBB4_1: @ %vector.ph -; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 -; CHECK: .LBB4_2: @ %vector.body +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: .LBB4_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 @@ -342,9 +342,9 @@ ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} ; CHECK-NEXT: .LBB5_1: @ %vector.ph -; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.8 lr, r3 -; CHECK: .LBB5_2: @ %vector.body +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: .LBB5_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: add.w r12, r12, #16 ; CHECK-NEXT: vldrb.u8 q0, [r1], #16 @@ -402,9 +402,9 @@ ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} ; CHECK-NEXT: .LBB6_1: @ %vector.ph -; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.16 lr, r3 -; CHECK: .LBB6_2: @ %vector.body +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: .LBB6_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: add.w r12, r12, #8 ; CHECK-NEXT: vldrh.u16 q0, [r1], #16 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll +++ 
b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll @@ -4,7 +4,7 @@ ; CHECK-LABEL: vec_mul_reduce_add ; CHECK: vector.ph: -; CHECK: call void @llvm.set.loop.iterations.i32 +; CHECK: %start = call i32 @llvm.start.loop.iterations.i32 ; CHECK: br label %vector.body ; CHECK: vector.body: @@ -33,7 +33,7 @@ %trip.count.minus.1 = add i32 %N, -1 %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer - call void @llvm.set.loop.iterations.i32(i32 %5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -41,7 +41,7 @@ %lsr.iv2 = phi i32* [ %scevgep3, %vector.body ], [ %a, %vector.ph ] %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %b, %vector.ph ] %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %9, %vector.body ] - %6 = phi i32 [ %5, %vector.ph ], [ %10, %vector.body ] + %6 = phi i32 [ %start, %vector.ph ], [ %10, %vector.body ] %lsr.iv24 = bitcast i32* %lsr.iv2 to <4 x i32>* %lsr.iv1 = bitcast i32* %lsr.iv to <4 x i32>* %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 @@ -74,6 +74,6 @@ declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) -declare void @llvm.set.loop.iterations.i32(i32) +declare i32 @llvm.start.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-unroll.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-unroll.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-unroll.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-unroll.ll @@ -23,7 +23,7 @@ br i1 %0, label %for.cond.cleanup.loopexit.unr-lcssa, label 
%vector.ph.new vector.ph.new: ; preds = %vector.ph - call void @llvm.set.loop.iterations.i32(i32 %tmp13) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13) %unroll_iter = sub i32 %tmp13, %xtraiter br label %vector.body @@ -113,6 +113,6 @@ declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) #1 declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>) #2 -declare void @llvm.set.loop.iterations.i32(i32) #3 +declare i32 @llvm.start.loop.iterations.i32(i32) #3 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir @@ -18,12 +18,12 @@ vector.ph: ; preds = %entry %sub = sub nsw i32 0, %x - call void @llvm.set.loop.iterations.i32(i32 %5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %lsr.iv1 = phi i32* [ %scevgep, %vector.body ], [ %A, %vector.ph ] - %6 = phi i32 [ %5, %vector.ph ], [ %18, %vector.body ] + %6 = phi i32 [ %start, %vector.ph ], [ %18, %vector.body ] %7 = phi i32 [ %n, %vector.ph ], [ %9, %vector.body ] %lsr.iv12 = bitcast i32* %lsr.iv1 to <4 x i32>* %8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7) @@ -84,12 +84,12 @@ vector.ph: ; preds = %entry %sub = sub nsw i32 0, %T - call void @llvm.set.loop.iterations.i32(i32 %5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %lsr.iv1 = phi i32* [ %scevgep, %vector.body ], [ %data, %vector.ph ] - %6 = phi i32 [ %5, %vector.ph ], [ %18, %vector.body ] + %6 = phi i32 [ %start, %vector.ph ], [ %18, %vector.body ] %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ] %lsr.iv12 = bitcast i32* %lsr.iv1 to <4 x i32>* 
%8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7) @@ -151,7 +151,7 @@ declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) - declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.start.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <4 x i1> @llvm.arm.mve.vctp32(i32) ... @@ -251,7 +251,7 @@ renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.2.vector.body: successors: %bb.2(0x7c000000), %bb.3(0x04000000) @@ -382,7 +382,7 @@ renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.2.vector.body: successors: %bb.2(0x7c000000), %bb.3(0x04000000) @@ -512,7 +512,7 @@ renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.2.vector.body: successors: %bb.2(0x7c000000), %bb.3(0x04000000) @@ -632,7 +632,7 @@ renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart 
renamable $lr bb.2.vector.body: successors: %bb.2(0x7c000000), %bb.3(0x04000000) @@ -748,7 +748,7 @@ renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.2.vector.body: successors: %bb.2(0x7c000000), %bb.3(0x04000000) @@ -862,7 +862,7 @@ renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.2.vector.body: successors: %bb.2(0x7c000000), %bb.3(0x04000000) @@ -983,7 +983,7 @@ renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.2.vector.body: successors: %bb.2(0x7c000000), %bb.3(0x04000000) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-liveout-lsr-shift.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-liveout-lsr-shift.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-liveout-lsr-shift.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-liveout-lsr-shift.mir @@ -14,7 +14,7 @@ br i1 %cmp11, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %5) %6 = shl i32 %4, 3 %7 = sub i32 %N, %6 br label %vector.body @@ -23,7 +23,7 @@ %lsr.iv20 = phi i8* [ %scevgep21, %vector.body ], [ %c, %vector.ph ] %lsr.iv = phi i8* [ %scevgep, 
%vector.body ], [ %b, %vector.ph ] %vec.phi = phi <8 x i16> [ , %vector.ph ], [ %15, %vector.body ] - %8 = phi i32 [ %5, %vector.ph ], [ %16, %vector.body ] + %8 = phi i32 [ %start, %vector.ph ], [ %16, %vector.body ] %9 = phi i32 [ %N, %vector.ph ], [ %11, %vector.body ] %lsr.iv2022 = bitcast i8* %lsr.iv20 to <8 x i8>* %lsr.iv19 = bitcast i8* %lsr.iv to <8 x i8>* @@ -55,7 +55,7 @@ } declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32 immarg, <8 x i1>, <8 x i8>) declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) - declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.start.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <8 x i1> @llvm.arm.mve.vctp16(i32) @@ -189,7 +189,7 @@ renamable $r12 = t2LSRri killed renamable $r12, 2, 14, $noreg, $noreg renamable $q0 = MVE_VLDRWU32 killed renamable $r3, 0, 0, $noreg :: (load 16 from constant-pool) renamable $r3 = t2SUBrs renamable $r2, killed renamable $r12, 26, 14, $noreg, $noreg - t2DoLoopStart renamable $lr + $lr = t2DoLoopStart renamable $lr bb.2.vector.body: successors: %bb.2(0x7c000000), %bb.3(0x04000000) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir @@ -17,11 +17,11 @@ br i1 %cmp9, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %tmp5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph - %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ] + %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ] %lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %vector.ph ] %lsr.iv = phi i16* [ %scevgep, 
%vector.body ], [ %a, %vector.ph ] %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %tmp13, %vector.body ] @@ -61,7 +61,7 @@ } declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1 declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #2 - declare void @llvm.set.loop.iterations.i32(i32) #3 + declare i32 @llvm.start.loop.iterations.i32(i32) #3 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4 @@ -182,7 +182,7 @@ renamable $r12 = t2SUBri killed renamable $r3, 4, 14, $noreg, $noreg renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14, $noreg, $noreg - t2DoLoopStart renamable $r3 + $lr = t2DoLoopStart renamable $r3 $r12 = tMOVr killed $r3, 14, $noreg $r3 = tMOVr $r2, 14, $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir @@ -16,11 +16,11 @@ br i1 %cmp9, label %for.cond.cleanup, label %vector.ph vector.ph: ; preds = %entry - call void @llvm.set.loop.iterations.i32(i32 %5) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %5) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph - %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %5, %vector.ph ] + %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ] %lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %vector.ph ] %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ] %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %12, %vector.body ] @@ -54,7 +54,7 @@ } declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1 declare i32 
@llvm.vector.reduce.add.v4i32(<4 x i32>) #2 - declare void @llvm.set.loop.iterations.i32(i32) #3 + declare i32 @llvm.start.loop.iterations.i32(i32) #3 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4 ... @@ -170,7 +170,7 @@ renamable $r12 = t2SUBri killed renamable $r3, 4, 14, $noreg, $noreg renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14, $noreg, $noreg - t2DoLoopStart renamable $r12 + $lr = t2DoLoopStart renamable $r12 $r3 = tMOVr killed $r12, 14, $noreg bb.2.vector.body: diff --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll --- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll @@ -723,9 +723,9 @@ ; CHECK: @ %bb.0: @ %for.body.us.preheader ; CHECK-NEXT: .save {r4, r5, r6, lr} ; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: ldrd lr, r12, [sp, #16] +; CHECK-NEXT: ldrd r3, r12, [sp, #16] +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: lsl.w r3, r12, #1 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB14_1: @ %for.body.us ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB14_2 Depth 2 @@ -1083,11 +1083,11 @@ ; CHECK-NEXT: beq.w .LBB16_12 ; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph ; CHECK-NEXT: ldrh r4, [r0] -; CHECK-NEXT: lsr.w r10, r3, #2 +; CHECK-NEXT: lsr.w r9, r3, #2 ; CHECK-NEXT: ldrd r5, r12, [r0, #4] ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: sub.w r0, r4, #8 -; CHECK-NEXT: and r9, r0, #7 +; CHECK-NEXT: and r8, r0, #7 ; CHECK-NEXT: add.w r7, r0, r0, lsr #29 ; CHECK-NEXT: asrs r6, r7, #3 ; CHECK-NEXT: cmp r6, #1 @@ -1106,7 +1106,7 @@ ; CHECK-NEXT: .LBB16_3: @ %while.end ; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 ; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: subs.w r10, r10, #1 +; CHECK-NEXT: subs.w r9, r9, #1 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: vstrb.8 q0, 
[r2], #8 ; CHECK-NEXT: add.w r0, r5, r0, lsl #1 @@ -1126,13 +1126,13 @@ ; CHECK-NEXT: ldrh.w r3, [r12, #6] ; CHECK-NEXT: ldrh.w r6, [r12, #4] ; CHECK-NEXT: ldrh.w r11, [r12, #2] -; CHECK-NEXT: ldrh.w r8, [r12] +; CHECK-NEXT: ldrh.w r10, [r12] ; CHECK-NEXT: vstrb.8 q0, [r1], #8 ; CHECK-NEXT: vldrw.u32 q0, [r5] ; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: adds r1, r5, #2 ; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: vmul.f16 q0, q0, r8 +; CHECK-NEXT: vmul.f16 q0, q0, r10 ; CHECK-NEXT: adds r1, r5, #6 ; CHECK-NEXT: vfma.f16 q0, q1, r11 ; CHECK-NEXT: vldrw.u32 q1, [r5, #4] @@ -1155,8 +1155,8 @@ ; CHECK-NEXT: blo .LBB16_7 ; CHECK-NEXT: @ %bb.5: @ %for.body.preheader ; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 -; CHECK-NEXT: ldr.w lr, [sp] @ 4-byte Reload -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload +; CHECK-NEXT: dls lr, r0 ; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: .LBB16_6: @ %for.body ; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1 @@ -1196,13 +1196,13 @@ ; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: .LBB16_8: @ %for.end ; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 -; CHECK-NEXT: cmp.w r9, #0 +; CHECK-NEXT: cmp.w r8, #0 ; CHECK-NEXT: beq.w .LBB16_3 ; CHECK-NEXT: b .LBB16_9 ; CHECK-NEXT: .LBB16_9: @ %while.body76.preheader ; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 ; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: mov lr, r9 +; CHECK-NEXT: mov lr, r8 ; CHECK-NEXT: .LBB16_10: @ %while.body76 ; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 @@ -1214,7 +1214,7 @@ ; CHECK-NEXT: b .LBB16_11 ; CHECK-NEXT: .LBB16_11: @ %while.end.loopexit ; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 -; CHECK-NEXT: add.w r5, r5, r9, lsl #1 +; CHECK-NEXT: add.w r5, r5, r8, lsl #1 ; CHECK-NEXT: b .LBB16_3 ; CHECK-NEXT: .LBB16_12: @ %if.end ; CHECK-NEXT: add sp, #24 diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll --- 
a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll @@ -701,9 +701,9 @@ ; CHECK: @ %bb.0: @ %for.body.us.preheader ; CHECK-NEXT: .save {r4, r5, r6, lr} ; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: ldrd lr, r12, [sp, #16] +; CHECK-NEXT: ldrd r3, r12, [sp, #16] +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: lsl.w r3, r12, #2 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB14_1: @ %for.body.us ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB14_2 Depth 2 @@ -1088,8 +1088,8 @@ ; CHECK-NEXT: @ Child Loop BB16_6 Depth 2 ; CHECK-NEXT: @ Child Loop BB16_10 Depth 2 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 -; CHECK-NEXT: ldrd r7, r4, [r12] -; CHECK-NEXT: ldrd r0, r6, [r12, #8] +; CHECK-NEXT: ldrd r7, r6, [r12] +; CHECK-NEXT: ldrd r0, r4, [r12, #8] ; CHECK-NEXT: ldrd r3, lr, [r12, #16] ; CHECK-NEXT: ldrd r11, r8, [r12, #24] ; CHECK-NEXT: vstrb.8 q0, [r9], #16 @@ -1099,11 +1099,11 @@ ; CHECK-NEXT: vmul.f32 q0, q0, r7 ; CHECK-NEXT: vldrw.u32 q6, [r5, #-24] ; CHECK-NEXT: vldrw.u32 q4, [r5, #-20] -; CHECK-NEXT: vfma.f32 q0, q1, r4 +; CHECK-NEXT: vfma.f32 q0, q1, r6 ; CHECK-NEXT: vldrw.u32 q5, [r5, #-16] ; CHECK-NEXT: vfma.f32 q0, q6, r0 ; CHECK-NEXT: vldrw.u32 q2, [r5, #-12] -; CHECK-NEXT: vfma.f32 q0, q4, r6 +; CHECK-NEXT: vfma.f32 q0, q4, r4 ; CHECK-NEXT: vldrw.u32 q3, [r5, #-8] ; CHECK-NEXT: vfma.f32 q0, q5, r3 ; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload @@ -1115,8 +1115,8 @@ ; CHECK-NEXT: blo .LBB16_7 ; CHECK-NEXT: @ %bb.5: @ %for.body.preheader ; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 -; CHECK-NEXT: ldr.w lr, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: dls lr, r0 ; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: .LBB16_6: @ %for.body ; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1 @@ -1155,13 +1155,13 @@ ; CHECK-NEXT: b .LBB16_9 ; CHECK-NEXT: .LBB16_9: @ %while.body76.preheader ; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 
-; CHECK-NEXT: mov r4, r5 +; CHECK-NEXT: mov r6, r5 ; CHECK-NEXT: mov lr, r3 ; CHECK-NEXT: .LBB16_10: @ %while.body76 ; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: ldr r0, [r7], #4 -; CHECK-NEXT: vldrw.u32 q1, [r4], #4 +; CHECK-NEXT: vldrw.u32 q1, [r6], #4 ; CHECK-NEXT: subs.w lr, lr, #1 ; CHECK-NEXT: vfma.f32 q0, q1, r0 ; CHECK-NEXT: bne .LBB16_10 @@ -1404,15 +1404,13 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_stereo_df2T_f32(%struct.arm_biquad_cascade_stereo_df2T_instance_f32* nocapture readonly %0, float* %1, float* %2, i32 %3) { ; CHECK-LABEL: arm_biquad_cascade_stereo_df2T_f32: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: .pad #24 ; CHECK-NEXT: sub sp, #24 -; CHECK-NEXT: ldrb.w lr, [r0] +; CHECK-NEXT: ldrb.w r8, [r0] ; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: ldrd r12, r0, [r0, #4] ; CHECK-NEXT: cmp r3, #0 @@ -1424,45 +1422,43 @@ ; CHECK-NEXT: mov r4, sp ; CHECK-NEXT: .LBB17_2: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB17_3 Depth 2 -; CHECK-NEXT: mov r7, lr -; CHECK-NEXT: ldr.w lr, [r0, #12] -; CHECK-NEXT: ldrd r5, r6, [r0] +; CHECK-NEXT: ldrd r5, r7, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r12] -; CHECK-NEXT: vldr s12, [r0, #8] -; CHECK-NEXT: vdup.32 q2, lr -; CHECK-NEXT: vldr s14, [r0, #16] +; CHECK-NEXT: vldr s8, [r0, #8] +; CHECK-NEXT: ldr r6, [r0, #12] ; CHECK-NEXT: vstrw.32 q1, [r4] -; CHECK-NEXT: vdup.32 q1, r6 -; CHECK-NEXT: mov r6, r2 -; CHECK-NEXT: vmov.f32 s6, s12 -; CHECK-NEXT: vmov.f32 s10, s14 +; CHECK-NEXT: vdup.32 q1, r7 +; CHECK-NEXT: vldr s12, [r0, #16] +; CHECK-NEXT: vmov.f32 s6, s8 ; CHECK-NEXT: dls lr, r3 -; CHECK-NEXT: vmov.f32 s7, s12 -; CHECK-NEXT: vmov.f32 s11, s14 +; 
CHECK-NEXT: vmov.f32 s7, s8 +; CHECK-NEXT: vdup.32 q2, r6 +; CHECK-NEXT: vmov.f32 s10, s12 +; CHECK-NEXT: mov r7, r2 +; CHECK-NEXT: vmov.f32 s11, s12 ; CHECK-NEXT: .LBB17_3: @ Parent Loop BB17_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vldrw.u32 q4, [r1, q0, uxtw #2] ; CHECK-NEXT: vldrw.u32 q5, [r4, q0, uxtw #2] ; CHECK-NEXT: adds r1, #8 ; CHECK-NEXT: vfma.f32 q5, q4, r5 -; CHECK-NEXT: vstmia r6, {s20, s21} -; CHECK-NEXT: adds r6, #8 +; CHECK-NEXT: vstmia r7, {s20, s21} +; CHECK-NEXT: adds r7, #8 ; CHECK-NEXT: vldrw.u32 q3, [sp, #8] ; CHECK-NEXT: vfma.f32 q3, q5, q2 ; CHECK-NEXT: vfma.f32 q3, q4, q1 ; CHECK-NEXT: vstrw.32 q3, [r4] ; CHECK-NEXT: le lr, .LBB17_3 ; CHECK-NEXT: @ %bb.4: @ in Loop: Header=BB17_2 Depth=1 -; CHECK-NEXT: mov lr, r7 -; CHECK-NEXT: adds r0, #20 -; CHECK-NEXT: subs.w lr, r7, #1 +; CHECK-NEXT: subs.w r8, r8, #1 +; CHECK-NEXT: add.w r0, r0, #20 ; CHECK-NEXT: vstrb.8 q3, [r12], #16 ; CHECK-NEXT: mov r1, r2 ; CHECK-NEXT: bne .LBB17_2 ; CHECK-NEXT: b .LBB17_7 ; CHECK-NEXT: .LBB17_5: @ %.preheader +; CHECK-NEXT: dls lr, r8 ; CHECK-NEXT: mov r0, sp -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB17_6: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r12], #16 ; CHECK-NEXT: vstrw.32 q0, [r0] @@ -1470,8 +1466,7 @@ ; CHECK-NEXT: .LBB17_7: ; CHECK-NEXT: add sp, #24 ; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} %5 = alloca [6 x float], align 4 %6 = getelementptr inbounds %struct.arm_biquad_cascade_stereo_df2T_instance_f32, %struct.arm_biquad_cascade_stereo_df2T_instance_f32* %0, i32 0, i32 1 %7 = load float*, float** %6, align 4 diff --git a/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll b/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll --- a/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll @@ -11,8 +11,8 @@ ; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: .LBB0_1: @ %vector.ph ; 
CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r3 +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r4, #4 @@ -74,8 +74,8 @@ ; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: .LBB1_1: @ %vector.ph ; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r3 +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r4, #4 @@ -138,8 +138,8 @@ ; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: .LBB2_1: @ %vector.ph ; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r3 +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: .LBB2_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r4, #4 @@ -201,8 +201,8 @@ ; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: .LBB3_1: @ %vector.ph ; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r3 +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r4, #4 @@ -332,8 +332,8 @@ ; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: vdup.32 q0, r4 ; CHECK-NEXT: vneg.f32 q0, q0 -; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: .LBB5_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: add.w r12, r12, #4 @@ -396,19 +396,28 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB6_1: @ %vector.ph +; CHECK-NEXT: add.w r12, r3, #3 +; CHECK-NEXT: mov.w lr, #1 +; CHECK-NEXT: bic r12, r12, #3 +; CHECK-NEXT: sub.w r12, r12, #4 +; CHECK-NEXT: add.w r12, lr, r12, lsr #2 +; CHECK-NEXT: dls lr, r12 ; CHECK-NEXT: vmov r12, s0 ; CHECK-NEXT: vdup.32 q0, r12 ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB6_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vctp.32 r3 ; 
CHECK-NEXT: add.w r12, r12, #4 +; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vmov q3, q0 -; CHECK-NEXT: vldrw.u32 q1, [r1], #16 -; CHECK-NEXT: vldrw.u32 q2, [r0], #16 +; CHECK-NEXT: vpstt +; CHECK-NEXT: vldrwt.u32 q1, [r1], #16 +; CHECK-NEXT: vldrwt.u32 q2, [r0], #16 ; CHECK-NEXT: vfms.f32 q3, q2, q1 -; CHECK-NEXT: vstrw.32 q3, [r2], #16 -; CHECK-NEXT: letp lr, .LBB6_2 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrwt.32 q3, [r2], #16 +; CHECK-NEXT: le lr, .LBB6_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: @@ -462,19 +471,28 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB7_1: @ %vector.ph +; CHECK-NEXT: add.w r12, r3, #3 +; CHECK-NEXT: mov.w lr, #1 +; CHECK-NEXT: bic r12, r12, #3 +; CHECK-NEXT: sub.w r12, r12, #4 +; CHECK-NEXT: add.w r12, lr, r12, lsr #2 +; CHECK-NEXT: dls lr, r12 ; CHECK-NEXT: vmov r12, s0 ; CHECK-NEXT: vdup.32 q0, r12 ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB7_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vctp.32 r3 ; CHECK-NEXT: add.w r12, r12, #4 +; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vmov q3, q0 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: vpstt +; CHECK-NEXT: vldrwt.u32 q1, [r0], #16 +; CHECK-NEXT: vldrwt.u32 q2, [r1], #16 ; CHECK-NEXT: vfms.f32 q3, q2, q1 -; CHECK-NEXT: vstrw.32 q3, [r2], #16 -; CHECK-NEXT: letp lr, .LBB7_2 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrwt.32 q3, [r2], #16 +; CHECK-NEXT: le lr, .LBB7_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: @@ -593,18 +611,27 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB9_1: @ %vector.ph +; CHECK-NEXT: add.w r12, r3, #3 +; CHECK-NEXT: mov.w lr, #1 +; CHECK-NEXT: bic r12, r12, #3 +; CHECK-NEXT: sub.w r12, r12, #4 +; CHECK-NEXT: add.w r12, lr, r12, lsr #2 +; CHECK-NEXT: dls lr, r12 ; CHECK-NEXT: vmov r12, s0 ; CHECK-NEXT: vdup.32 q0, r12 ; CHECK-NEXT: mov.w r12, #0 -; 
CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB9_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vctp.32 r3 ; CHECK-NEXT: add.w r12, r12, #4 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: vpstt +; CHECK-NEXT: vldrwt.u32 q1, [r0], #16 +; CHECK-NEXT: vldrwt.u32 q2, [r1], #16 ; CHECK-NEXT: vfms.f32 q2, q1, q0 -; CHECK-NEXT: vstrw.32 q2, [r2], #16 -; CHECK-NEXT: letp lr, .LBB9_2 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrwt.32 q2, [r2], #16 +; CHECK-NEXT: le lr, .LBB9_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: @@ -659,8 +686,8 @@ ; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: .LBB10_1: @ %vector.ph ; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r3 +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: .LBB10_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 @@ -724,8 +751,8 @@ ; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: .LBB11_1: @ %vector.ph ; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r3 +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: .LBB11_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 diff --git a/llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll b/llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll --- a/llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll @@ -6,10 +6,10 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: mov.w r2, #256 +; CHECK-NEXT: dls lr, r2 ; CHECK-NEXT: adr r2, .LCPI0_0 -; CHECK-NEXT: mov.w lr, #256 ; CHECK-NEXT: vldrw.u32 q0, [r2] -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 @@ -52,10 +52,10 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, 
lr} ; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: movs r2, #128 +; CHECK-NEXT: dls lr, r2 ; CHECK-NEXT: adr r2, .LCPI1_0 -; CHECK-NEXT: mov.w lr, #128 ; CHECK-NEXT: vldrw.u32 q0, [r2] -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] @@ -102,10 +102,10 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: movs r2, #64 +; CHECK-NEXT: dls lr, r2 ; CHECK-NEXT: adr r2, .LCPI2_0 -; CHECK-NEXT: mov.w lr, #64 ; CHECK-NEXT: vldrw.u32 q0, [r2] -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r0, #48] @@ -160,10 +160,10 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: mov.w r2, #256 +; CHECK-NEXT: dls lr, r2 ; CHECK-NEXT: adr r2, .LCPI3_0 -; CHECK-NEXT: mov.w lr, #256 ; CHECK-NEXT: vldrw.u32 q0, [r2] -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u32 q1, [r0], #8 @@ -206,10 +206,10 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: movs r2, #128 +; CHECK-NEXT: dls lr, r2 ; CHECK-NEXT: adr r2, .LCPI4_0 -; CHECK-NEXT: mov.w lr, #128 ; CHECK-NEXT: vldrw.u32 q0, [r2] -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB4_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u32 q1, [r0, #8] @@ -256,10 +256,10 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: movs r2, #64 +; CHECK-NEXT: dls lr, r2 ; CHECK-NEXT: adr r2, .LCPI5_0 -; CHECK-NEXT: mov.w lr, #64 ; CHECK-NEXT: vldrw.u32 q0, [r2] -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB5_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u32 q1, [r0, #24] @@ -314,10 +314,10 @@ ; CHECK: @ %bb.0: @ 
%entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: mov.w r2, #256 +; CHECK-NEXT: dls lr, r2 ; CHECK-NEXT: adr r2, .LCPI6_0 -; CHECK-NEXT: mov.w lr, #256 ; CHECK-NEXT: vldrw.u32 q0, [r2] -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB6_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u32 q1, [r0], #8 @@ -362,10 +362,10 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: movs r2, #128 +; CHECK-NEXT: dls lr, r2 ; CHECK-NEXT: adr r2, .LCPI7_0 -; CHECK-NEXT: mov.w lr, #128 ; CHECK-NEXT: vldrw.u32 q0, [r2] -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB7_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u32 q1, [r0, #8] @@ -415,10 +415,10 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: movs r2, #64 +; CHECK-NEXT: dls lr, r2 ; CHECK-NEXT: adr r2, .LCPI8_0 -; CHECK-NEXT: mov.w lr, #64 ; CHECK-NEXT: vldrw.u32 q0, [r2] -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u32 q1, [r0, #24] @@ -478,10 +478,10 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: movs r2, #128 +; CHECK-NEXT: dls lr, r2 ; CHECK-NEXT: adr r2, .LCPI9_0 -; CHECK-NEXT: mov.w lr, #128 ; CHECK-NEXT: vldrw.u32 q0, [r2] -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB9_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q1, [r0], #16 @@ -534,10 +534,10 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: movs r2, #128 +; CHECK-NEXT: dls lr, r2 ; CHECK-NEXT: adr r2, .LCPI10_0 -; CHECK-NEXT: mov.w lr, #128 ; CHECK-NEXT: vldrw.u32 q0, [r2] -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB10_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q1, [r0] diff --git 
a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll @@ -328,9 +328,9 @@ ; CHECK-NEXT: .LBB8_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB8_3 Depth 2 +; CHECK-NEXT: dls lr, r4 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: dls lr, r4 ; CHECK-NEXT: .LBB8_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB8_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 @@ -405,11 +405,11 @@ ; CHECK-NEXT: .LBB9_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB9_3 Depth 2 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: vmov q3, q1 ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vmov q5, q2 -; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: .LBB9_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB9_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 @@ -501,9 +501,9 @@ ; CHECK-NEXT: .LBB10_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB10_3 Depth 2 +; CHECK-NEXT: dls lr, r4 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: dls lr, r4 ; CHECK-NEXT: .LBB10_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB10_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 @@ -570,20 +570,21 @@ ; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: blt .LBB11_5 ; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader -; CHECK-NEXT: bic r1, r2, #7 -; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: sub.w r3, r1, #8 +; CHECK-NEXT: bic r8, r2, #7 +; CHECK-NEXT: movs r5, #1 +; CHECK-NEXT: sub.w r6, r8, #8 ; CHECK-NEXT: vmov.i16 q1, #0x8 +; CHECK-NEXT: add.w r1, r5, r6, lsr #3 +; CHECK-NEXT: adr r6, .LCPI11_0 +; CHECK-NEXT: vldrw.u32 q0, [r6] ; CHECK-NEXT: str r1, [sp] @ 4-byte Spill -; CHECK-NEXT: add.w r8, r6, r3, lsr #3 -; CHECK-NEXT: adr r3, .LCPI11_0 -; CHECK-NEXT: vldrw.u32 q0, 
[r3] ; CHECK-NEXT: .LBB11_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB11_3 Depth 2 +; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-NEXT: vmov q2, q0 -; CHECK-NEXT: ldr r3, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: dls lr, r8 +; CHECK-NEXT: dls lr, r1 +; CHECK-NEXT: ldr r4, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: .LBB11_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB11_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 @@ -606,7 +607,7 @@ ; CHECK-NEXT: vmov r7, s16 ; CHECK-NEXT: vmov.32 q3[2], r5 ; CHECK-NEXT: vmov.u16 r5, q2[3] -; CHECK-NEXT: vmov r4, s17 +; CHECK-NEXT: vmov r3, s17 ; CHECK-NEXT: vmov.32 q3[3], r5 ; CHECK-NEXT: vadd.i16 q2, q2, q1 ; CHECK-NEXT: vmovlb.s16 q3, q3 @@ -617,7 +618,7 @@ ; CHECK-NEXT: vmov r12, s13 ; CHECK-NEXT: ldrh.w r11, [r7] ; CHECK-NEXT: vmov r7, s12 -; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: ldrh r3, [r3] ; CHECK-NEXT: ldrh.w r9, [r5] ; CHECK-NEXT: vmov r5, s18 ; CHECK-NEXT: ldrh.w r10, [r6] @@ -630,16 +631,15 @@ ; CHECK-NEXT: vmov.16 q3[3], r9 ; CHECK-NEXT: vmov.16 q3[4], r11 ; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: vmov.16 q3[5], r4 +; CHECK-NEXT: vmov.16 q3[5], r3 ; CHECK-NEXT: ldrh r6, [r6] ; CHECK-NEXT: vmov.16 q3[6], r5 ; CHECK-NEXT: vmov.16 q3[7], r6 -; CHECK-NEXT: vstrb.8 q3, [r3], #16 +; CHECK-NEXT: vstrb.8 q3, [r4], #16 ; CHECK-NEXT: le lr, .LBB11_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB11_2 Depth=1 -; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload -; CHECK-NEXT: cmp r1, r2 +; CHECK-NEXT: cmp r8, r2 ; CHECK-NEXT: bne .LBB11_2 ; CHECK-NEXT: .LBB11_5: @ %for.cond.cleanup ; CHECK-NEXT: add sp, #8 @@ -704,42 +704,43 @@ ; CHECK-NEXT: str r1, [sp, #60] @ 4-byte Spill ; CHECK-NEXT: blt.w .LBB12_5 ; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader -; CHECK-NEXT: bic r1, r2, #7 -; CHECK-NEXT: movs r7, #1 -; CHECK-NEXT: sub.w r3, r1, #8 +; CHECK-NEXT: bic r8, r2, #7 ; CHECK-NEXT: adr r6, .LCPI12_2 +; CHECK-NEXT: sub.w r3, r8, #8 +; CHECK-NEXT: 
vldrw.u32 q0, [r6] +; CHECK-NEXT: movs r7, #1 ; CHECK-NEXT: vmov.i16 q3, #0x18 -; CHECK-NEXT: str r1, [sp, #56] @ 4-byte Spill -; CHECK-NEXT: add.w r8, r7, r3, lsr #3 -; CHECK-NEXT: adr r7, .LCPI12_1 -; CHECK-NEXT: vldrw.u32 q0, [r7] +; CHECK-NEXT: add.w r1, r7, r3, lsr #3 ; CHECK-NEXT: adr r3, .LCPI12_0 -; CHECK-NEXT: vstrw.32 q3, [sp, #64] @ 16-byte Spill ; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r6] -; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: adr r7, .LCPI12_1 +; CHECK-NEXT: str r1, [sp, #56] @ 4-byte Spill +; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [r7] +; CHECK-NEXT: vstrw.32 q3, [sp, #64] @ 16-byte Spill ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill ; CHECK-NEXT: .LBB12_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB12_3 Depth 2 -; CHECK-NEXT: dls lr, r8 -; CHECK-NEXT: ldr r3, [sp, #60] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #56] @ 4-byte Reload +; CHECK-NEXT: dls lr, r1 +; CHECK-NEXT: ldr r4, [sp, #60] @ 4-byte Reload ; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vldrw.u32 q5, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: vldrw.u32 q6, [sp] @ 16-byte Reload ; CHECK-NEXT: .LBB12_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB12_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vmov.u16 r4, q5[0] +; CHECK-NEXT: vmov.u16 r3, q5[0] ; CHECK-NEXT: vmov.u16 r7, q7[4] -; CHECK-NEXT: vmov.32 q0[0], r4 -; CHECK-NEXT: vmov.u16 r4, q5[1] -; CHECK-NEXT: vmov.32 q0[1], r4 -; CHECK-NEXT: vmov.u16 r4, q5[2] -; CHECK-NEXT: vmov.32 q0[2], r4 -; CHECK-NEXT: vmov.u16 r4, q5[3] -; CHECK-NEXT: vmov.32 q0[3], r4 +; CHECK-NEXT: vmov.32 q0[0], r3 +; CHECK-NEXT: vmov.u16 r3, q5[1] +; CHECK-NEXT: vmov.32 q0[1], r3 +; CHECK-NEXT: vmov.u16 r3, q5[2] +; CHECK-NEXT: vmov.32 q0[2], r3 +; CHECK-NEXT: vmov.u16 r3, q5[3] +; CHECK-NEXT: vmov.32 q0[3], r3 ; 
CHECK-NEXT: vmov.u16 r12, q6[0] ; CHECK-NEXT: vmovlb.s16 q0, q0 ; CHECK-NEXT: vmov.32 q1[0], r12 @@ -747,7 +748,7 @@ ; CHECK-NEXT: vmov.u16 r1, q6[1] ; CHECK-NEXT: vadd.i32 q2, q0, r0 ; CHECK-NEXT: vmov.32 q1[1], r1 -; CHECK-NEXT: vmov r4, s10 +; CHECK-NEXT: vmov r3, s10 ; CHECK-NEXT: vmov.u16 r1, q6[2] ; CHECK-NEXT: vmov.32 q1[2], r1 ; CHECK-NEXT: vmov.u16 r1, q6[3] @@ -757,26 +758,26 @@ ; CHECK-NEXT: vmov r6, s11 ; CHECK-NEXT: vshl.i32 q1, q1, #1 ; CHECK-NEXT: vadd.i32 q4, q1, r0 -; CHECK-NEXT: ldrh.w r9, [r4] -; CHECK-NEXT: vmov.u16 r4, q5[4] -; CHECK-NEXT: vmov.32 q0[0], r4 -; CHECK-NEXT: vmov.u16 r4, q5[5] -; CHECK-NEXT: vmov.32 q0[1], r4 -; CHECK-NEXT: vmov.u16 r4, q5[6] -; CHECK-NEXT: vmov.32 q0[2], r4 -; CHECK-NEXT: vmov.u16 r4, q5[7] -; CHECK-NEXT: vmov.32 q0[3], r4 +; CHECK-NEXT: ldrh.w r9, [r3] +; CHECK-NEXT: vmov.u16 r3, q5[4] +; CHECK-NEXT: vmov.32 q0[0], r3 +; CHECK-NEXT: vmov.u16 r3, q5[5] +; CHECK-NEXT: vmov.32 q0[1], r3 +; CHECK-NEXT: vmov.u16 r3, q5[6] +; CHECK-NEXT: vmov.32 q0[2], r3 +; CHECK-NEXT: vmov.u16 r3, q5[7] +; CHECK-NEXT: vmov.32 q0[3], r3 ; CHECK-NEXT: ldrh r6, [r6] ; CHECK-NEXT: vmovlb.s16 q0, q0 ; CHECK-NEXT: vshl.i32 q0, q0, #1 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov r5, s3 -; CHECK-NEXT: ldrh.w r10, [r4] -; CHECK-NEXT: vmov r4, s1 +; CHECK-NEXT: ldrh.w r10, [r3] +; CHECK-NEXT: vmov r3, s1 ; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: ldrh.w r11, [r4] -; CHECK-NEXT: vmov r4, s2 +; CHECK-NEXT: ldrh.w r11, [r3] +; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: vmov.32 q0[0], r7 ; CHECK-NEXT: vmov.u16 r7, q7[5] ; CHECK-NEXT: vmov.32 q0[1], r7 @@ -811,7 +812,7 @@ ; CHECK-NEXT: vshl.i32 q3, q3, #1 ; CHECK-NEXT: vadd.i32 q0, q0, r0 ; CHECK-NEXT: vadd.i32 q3, q3, r0 -; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: ldrh r3, [r3] ; CHECK-NEXT: ldrh r7, [r7] ; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: vmov.16 q1[0], r1 @@ -823,7 +824,7 @@ ; CHECK-NEXT: vmov.16 q1[3], r6 ; CHECK-NEXT: vmov.16 
q1[4], r10 ; CHECK-NEXT: vmov.16 q1[5], r11 -; CHECK-NEXT: vmov.16 q1[6], r4 +; CHECK-NEXT: vmov.16 q1[6], r3 ; CHECK-NEXT: vmov.16 q1[7], r5 ; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: vmov.16 q2[0], r1 @@ -877,12 +878,11 @@ ; CHECK-NEXT: vmov.16 q0[7], r1 ; CHECK-NEXT: vadd.i16 q0, q0, q2 ; CHECK-NEXT: vadd.i16 q0, q0, q1 -; CHECK-NEXT: vstrb.8 q0, [r3], #16 +; CHECK-NEXT: vstrb.8 q0, [r4], #16 ; CHECK-NEXT: le lr, .LBB12_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB12_2 Depth=1 -; CHECK-NEXT: ldr r1, [sp, #56] @ 4-byte Reload -; CHECK-NEXT: cmp r1, r2 +; CHECK-NEXT: cmp r8, r2 ; CHECK-NEXT: bne.w .LBB12_2 ; CHECK-NEXT: .LBB12_5: @ %for.cond.cleanup ; CHECK-NEXT: add sp, #104 @@ -892,6 +892,15 @@ ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.6: ; CHECK-NEXT: .LCPI12_0: +; CHECK-NEXT: .short 1 @ 0x1 +; CHECK-NEXT: .short 4 @ 0x4 +; CHECK-NEXT: .short 7 @ 0x7 +; CHECK-NEXT: .short 10 @ 0xa +; CHECK-NEXT: .short 13 @ 0xd +; CHECK-NEXT: .short 16 @ 0x10 +; CHECK-NEXT: .short 19 @ 0x13 +; CHECK-NEXT: .short 22 @ 0x16 +; CHECK-NEXT: .LCPI12_1: ; CHECK-NEXT: .short 0 @ 0x0 ; CHECK-NEXT: .short 3 @ 0x3 ; CHECK-NEXT: .short 6 @ 0x6 @@ -900,7 +909,7 @@ ; CHECK-NEXT: .short 15 @ 0xf ; CHECK-NEXT: .short 18 @ 0x12 ; CHECK-NEXT: .short 21 @ 0x15 -; CHECK-NEXT: .LCPI12_1: +; CHECK-NEXT: .LCPI12_2: ; CHECK-NEXT: .short 2 @ 0x2 ; CHECK-NEXT: .short 5 @ 0x5 ; CHECK-NEXT: .short 8 @ 0x8 @@ -909,15 +918,6 @@ ; CHECK-NEXT: .short 17 @ 0x11 ; CHECK-NEXT: .short 20 @ 0x14 ; CHECK-NEXT: .short 23 @ 0x17 -; CHECK-NEXT: .LCPI12_2: -; CHECK-NEXT: .short 1 @ 0x1 -; CHECK-NEXT: .short 4 @ 0x4 -; CHECK-NEXT: .short 7 @ 0x7 -; CHECK-NEXT: .short 10 @ 0xa -; CHECK-NEXT: .short 13 @ 0xd -; CHECK-NEXT: .short 16 @ 0x10 -; CHECK-NEXT: .short 19 @ 0x13 -; CHECK-NEXT: .short 22 @ 0x16 entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll +++ 
b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll @@ -740,8 +740,8 @@ ; CHECK-NEXT: .LBB22_1: @ %vector.body.preheader ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r2, lsr #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: add.w r2, r3, r2, lsr #2 +; CHECK-NEXT: dls lr, r2 ; CHECK-NEXT: .LBB22_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 @@ -786,8 +786,8 @@ ; CHECK-NEXT: .LBB23_1: @ %vector.body.preheader ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r2, lsr #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: add.w r2, r3, r2, lsr #2 +; CHECK-NEXT: dls lr, r2 ; CHECK-NEXT: .LBB23_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll @@ -448,38 +448,38 @@ ; CHECK-NEXT: ldrd r9, r12, [sp, #128] ; CHECK-NEXT: sub.w r7, r12, #1 ; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: adr r5, .LCPI9_0 +; CHECK-NEXT: mov.w r8, #0 ; CHECK-NEXT: add.w r7, r6, r7, lsr #1 ; CHECK-NEXT: vdup.32 q1, r9 ; CHECK-NEXT: bic r7, r7, #3 -; CHECK-NEXT: vldrw.u32 q2, [r5] -; CHECK-NEXT: subs r7, #4 -; CHECK-NEXT: mov.w r8, #0 ; CHECK-NEXT: vshl.i32 q3, q1, #3 -; CHECK-NEXT: add.w r7, r6, r7, lsr #2 +; CHECK-NEXT: subs r7, #4 +; CHECK-NEXT: add.w r10, r6, r7, lsr #2 +; CHECK-NEXT: adr r7, .LCPI9_0 ; CHECK-NEXT: adr r6, .LCPI9_1 +; CHECK-NEXT: vldrw.u32 q2, [r7] ; CHECK-NEXT: vldrw.u32 q0, [r6] ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill ; CHECK-NEXT: .LBB9_1: @ %for.cond8.preheader.us.us.preheader ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB9_2 Depth 2 ; CHECK-NEXT: @ Child Loop BB9_3 Depth 3 -; CHECK-NEXT: mul r10, r8, r9 +; CHECK-NEXT: mul 
r11, r8, r9 ; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: mul r11, r8, r12 +; CHECK-NEXT: mul r7, r8, r12 ; CHECK-NEXT: .LBB9_2: @ %vector.ph ; CHECK-NEXT: @ Parent Loop BB9_1 Depth=1 ; CHECK-NEXT: @ => This Loop Header: Depth=2 ; CHECK-NEXT: @ Child Loop BB9_3 Depth 3 -; CHECK-NEXT: vdup.32 q5, r11 +; CHECK-NEXT: vdup.32 q5, r7 ; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload ; CHECK-NEXT: vshl.i32 q5, q5, #2 ; CHECK-NEXT: vmov q6, q1 ; CHECK-NEXT: vadd.i32 q5, q5, r0 +; CHECK-NEXT: dls lr, r10 ; CHECK-NEXT: vmov.i32 q4, #0x0 ; CHECK-NEXT: vadd.i32 q5, q5, q0 ; CHECK-NEXT: vmlas.u32 q6, q2, r5 -; CHECK-NEXT: dls lr, r7 ; CHECK-NEXT: .LBB9_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB9_1 Depth=1 ; CHECK-NEXT: @ Parent Loop BB9_2 Depth=2 @@ -493,11 +493,11 @@ ; CHECK-NEXT: le lr, .LBB9_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB9_2 Depth=2 -; CHECK-NEXT: add.w r6, r5, r10 +; CHECK-NEXT: add.w r4, r5, r11 ; CHECK-NEXT: adds r5, #1 -; CHECK-NEXT: vaddv.u32 r4, q4 +; CHECK-NEXT: vaddv.u32 r6, q4 ; CHECK-NEXT: cmp r5, r9 -; CHECK-NEXT: str.w r4, [r2, r6, lsl #2] +; CHECK-NEXT: str.w r6, [r2, r4, lsl #2] ; CHECK-NEXT: bne .LBB9_2 ; CHECK-NEXT: @ %bb.5: @ %for.cond4.for.cond.cleanup6_crit_edge.us ; CHECK-NEXT: @ in Loop: Header=BB9_1 Depth=1 @@ -596,7 +596,7 @@ ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: .pad #24 ; CHECK-NEXT: sub sp, #24 -; CHECK-NEXT: strd r0, r2, [sp, #16] @ 8-byte Folded Spill +; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: mov r0, r3 @@ -610,38 +610,39 @@ ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .LBB10_2: @ %for.cond1.preheader.us.preheader -; CHECK-NEXT: ldr.w r11, [sp, #116] +; CHECK-NEXT: ldr.w r9, [sp, #116] ; CHECK-NEXT: mov r6, r1 ; CHECK-NEXT: movs r1, #1 -; CHECK-NEXT: mov.w r9, #0 -; CHECK-NEXT: bic r10, r11, #3 +; CHECK-NEXT: mov r11, r2 +; CHECK-NEXT: 
bic r10, r9, #3 +; CHECK-NEXT: mov.w r8, #0 ; CHECK-NEXT: sub.w r0, r10, #4 -; CHECK-NEXT: add.w r8, r1, r0, lsr #2 +; CHECK-NEXT: add.w r0, r1, r0, lsr #2 ; CHECK-NEXT: ldr r1, [sp, #112] -; CHECK-NEXT: lsl.w r0, r11, #1 +; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: lsl.w r0, r9, #1 ; CHECK-NEXT: str r0, [sp] @ 4-byte Spill ; CHECK-NEXT: adr r0, .LCPI10_0 ; CHECK-NEXT: vdup.32 q4, r1 ; CHECK-NEXT: vldrw.u32 q5, [r0] ; CHECK-NEXT: lsls r4, r1, #1 -; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: vshl.i32 q6, q4, #2 ; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: b .LBB10_5 ; CHECK-NEXT: .LBB10_3: @ %for.cond5.preheader.us73.preheader ; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1 -; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: add.w r0, r11, r12, lsl #1 ; CHECK-NEXT: mov r1, r4 -; CHECK-NEXT: add.w r0, r0, r12, lsl #1 ; CHECK-NEXT: bl __aeabi_memclr ; CHECK-NEXT: .LBB10_4: @ %for.cond1.for.cond.cleanup3_crit_edge.us ; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1 ; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload -; CHECK-NEXT: add r9, r11 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: add r8, r9 +; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: add r1, r0 -; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: adds r1, #1 @@ -653,7 +654,7 @@ ; CHECK-NEXT: @ Child Loop BB10_11 Depth 3 ; CHECK-NEXT: @ Child Loop BB10_14 Depth 3 ; CHECK-NEXT: ldr r0, [sp, #112] -; CHECK-NEXT: cmp.w r11, #0 +; CHECK-NEXT: cmp.w r9, #0 ; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: mul r12, r1, r0 ; CHECK-NEXT: beq .LBB10_3 @@ -663,31 +664,31 @@ ; CHECK-NEXT: b .LBB10_8 ; CHECK-NEXT: .LBB10_7: @ 
%for.cond5.for.cond.cleanup7_crit_edge.us.us ; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2 -; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #112] ; CHECK-NEXT: add.w r3, r1, r12 ; CHECK-NEXT: adds r1, #1 -; CHECK-NEXT: strh.w r2, [r0, r3, lsl #1] -; CHECK-NEXT: ldr r0, [sp, #112] ; CHECK-NEXT: cmp r1, r0 +; CHECK-NEXT: strh.w r2, [r11, r3, lsl #1] ; CHECK-NEXT: beq .LBB10_4 ; CHECK-NEXT: .LBB10_8: @ %for.cond5.preheader.us.us ; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1 ; CHECK-NEXT: @ => This Loop Header: Depth=2 ; CHECK-NEXT: @ Child Loop BB10_11 Depth 3 ; CHECK-NEXT: @ Child Loop BB10_14 Depth 3 -; CHECK-NEXT: cmp.w r11, #3 +; CHECK-NEXT: cmp.w r9, #3 ; CHECK-NEXT: bhi .LBB10_10 ; CHECK-NEXT: @ %bb.9: @ in Loop: Header=BB10_8 Depth=2 -; CHECK-NEXT: movs r5, #0 +; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: b .LBB10_13 ; CHECK-NEXT: .LBB10_10: @ %vector.ph ; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2 +; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: vmov q1, q4 -; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: vmlas.u32 q1, q5, r1 -; CHECK-NEXT: dls lr, r8 +; CHECK-NEXT: dls lr, r0 +; CHECK-NEXT: ldr r2, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: .LBB10_11: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1 ; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2 @@ -702,18 +703,18 @@ ; CHECK-NEXT: @ %bb.12: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2 ; CHECK-NEXT: vaddv.u32 r2, q0 -; CHECK-NEXT: cmp r10, r11 -; CHECK-NEXT: mov r5, r10 +; CHECK-NEXT: cmp r10, r9 +; CHECK-NEXT: mov r7, r10 ; CHECK-NEXT: beq .LBB10_7 ; CHECK-NEXT: .LBB10_13: @ %for.body8.us.us.preheader ; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2 ; CHECK-NEXT: ldr r0, [sp, #112] -; CHECK-NEXT: sub.w lr, r11, r5 -; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: mla r3, r0, r5, r1 -; CHECK-NEXT: add r5, r9 -; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: add.w r5, r8, r7 +; 
CHECK-NEXT: mla r3, r0, r7, r1 +; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: sub.w r7, r9, r7 ; CHECK-NEXT: add.w r5, r0, r5, lsl #1 +; CHECK-NEXT: dls lr, r7 ; CHECK-NEXT: add.w r3, r6, r3, lsl #1 ; CHECK-NEXT: .LBB10_14: @ %for.body8.us.us ; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1 @@ -850,54 +851,54 @@ ; CHECK-NEXT: .pad #8 ; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: ldrd r2, r7, [sp, #104] -; CHECK-NEXT: add.w r12, r7, #10 +; CHECK-NEXT: add.w r8, r7, #10 ; CHECK-NEXT: adr r7, .LCPI11_0 ; CHECK-NEXT: ldr r1, [sp, #96] ; CHECK-NEXT: vdup.32 q1, r2 ; CHECK-NEXT: vldrw.u32 q0, [r7] -; CHECK-NEXT: mov.w r9, #0 -; CHECK-NEXT: mov.w r10, #11 +; CHECK-NEXT: mov.w r10, #0 +; CHECK-NEXT: mov.w r9, #6 +; CHECK-NEXT: movs r6, #11 ; CHECK-NEXT: vshl.i32 q1, q1, #2 -; CHECK-NEXT: movs r6, #0 +; CHECK-NEXT: movs r5, #0 ; CHECK-NEXT: .LBB11_1: @ %for.body10.i ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB11_2 Depth 2 ; CHECK-NEXT: @ Child Loop BB11_3 Depth 3 ; CHECK-NEXT: @ Child Loop BB11_4 Depth 4 ; CHECK-NEXT: @ Child Loop BB11_5 Depth 5 -; CHECK-NEXT: mov.w r8, #0 -; CHECK-NEXT: str r6, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: .LBB11_2: @ %for.cond22.preheader.i ; CHECK-NEXT: @ Parent Loop BB11_1 Depth=1 ; CHECK-NEXT: @ => This Loop Header: Depth=2 ; CHECK-NEXT: @ Child Loop BB11_3 Depth 3 ; CHECK-NEXT: @ Child Loop BB11_4 Depth 4 ; CHECK-NEXT: @ Child Loop BB11_5 Depth 5 -; CHECK-NEXT: movs r6, #0 +; CHECK-NEXT: movs r5, #0 ; CHECK-NEXT: .LBB11_3: @ %for.body27.i ; CHECK-NEXT: @ Parent Loop BB11_1 Depth=1 ; CHECK-NEXT: @ Parent Loop BB11_2 Depth=2 ; CHECK-NEXT: @ => This Loop Header: Depth=3 ; CHECK-NEXT: @ Child Loop BB11_4 Depth 4 ; CHECK-NEXT: @ Child Loop BB11_5 Depth 5 -; CHECK-NEXT: mov.w lr, #6 -; CHECK-NEXT: movs r4, #0 -; CHECK-NEXT: movs r5, #4 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dls lr, r9 +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: mov.w r11, #4 
; CHECK-NEXT: .LBB11_4: @ %for.body78.us.i ; CHECK-NEXT: @ Parent Loop BB11_1 Depth=1 ; CHECK-NEXT: @ Parent Loop BB11_2 Depth=2 ; CHECK-NEXT: @ Parent Loop BB11_3 Depth=3 ; CHECK-NEXT: @ => This Loop Header: Depth=4 ; CHECK-NEXT: @ Child Loop BB11_5 Depth 5 -; CHECK-NEXT: mul r7, r5, r10 -; CHECK-NEXT: vdup.32 q3, r6 -; CHECK-NEXT: vdup.32 q2, r8 -; CHECK-NEXT: mov r11, r12 -; CHECK-NEXT: vadd.i32 q4, q0, r7 +; CHECK-NEXT: mul r4, r11, r6 +; CHECK-NEXT: vdup.32 q3, r5 +; CHECK-NEXT: vdup.32 q2, r7 +; CHECK-NEXT: vadd.i32 q4, q0, r4 ; CHECK-NEXT: vmla.u32 q3, q4, r2 -; CHECK-NEXT: adds r7, #113 -; CHECK-NEXT: vadd.i32 q4, q0, r7 +; CHECK-NEXT: adds r4, #113 +; CHECK-NEXT: vadd.i32 q4, q0, r4 +; CHECK-NEXT: mov r4, r8 ; CHECK-NEXT: vmla.u32 q2, q4, r2 ; CHECK-NEXT: .LBB11_5: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB11_1 Depth=1 @@ -908,36 +909,36 @@ ; CHECK-NEXT: vldrb.s32 q6, [r0, q2] ; CHECK-NEXT: vadd.i32 q5, q2, q1 ; CHECK-NEXT: vadd.i32 q4, q3, q1 -; CHECK-NEXT: subs.w r11, r11, #4 +; CHECK-NEXT: subs r4, #4 ; CHECK-NEXT: vadd.i32 q2, q6, r2 ; CHECK-NEXT: vldrb.s32 q6, [r1, q3] ; CHECK-NEXT: vmov q3, q4 -; CHECK-NEXT: vmlava.u32 r4, q2, q6 +; CHECK-NEXT: vmlava.u32 r12, q2, q6 ; CHECK-NEXT: vmov q2, q5 ; CHECK-NEXT: bne .LBB11_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB11_4 Depth=4 -; CHECK-NEXT: adds r5, #1 +; CHECK-NEXT: add.w r11, r11, #1 ; CHECK-NEXT: le lr, .LBB11_4 ; CHECK-NEXT: @ %bb.7: @ %for.cond.cleanup77.i ; CHECK-NEXT: @ in Loop: Header=BB11_3 Depth=3 -; CHECK-NEXT: adds r6, #1 -; CHECK-NEXT: add.w r9, r9, #1 -; CHECK-NEXT: cmp r6, r2 +; CHECK-NEXT: adds r5, #1 +; CHECK-NEXT: add.w r10, r10, #1 +; CHECK-NEXT: cmp r5, r2 ; CHECK-NEXT: bne .LBB11_3 ; CHECK-NEXT: @ %bb.8: @ %for.cond.cleanup26.i ; CHECK-NEXT: @ in Loop: Header=BB11_2 Depth=2 -; CHECK-NEXT: add.w r8, r8, #1 -; CHECK-NEXT: cmp r8, r3 +; CHECK-NEXT: adds r7, #1 +; CHECK-NEXT: cmp r7, r3 ; CHECK-NEXT: bne .LBB11_2 ; CHECK-NEXT: @ %bb.9: @ 
%for.cond.cleanup20.i ; CHECK-NEXT: @ in Loop: Header=BB11_1 Depth=1 -; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: ldr r5, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: ldr r7, [sp, #148] -; CHECK-NEXT: adds r6, #1 -; CHECK-NEXT: cmp r6, r7 +; CHECK-NEXT: adds r5, #1 +; CHECK-NEXT: cmp r5, r7 ; CHECK-NEXT: it eq -; CHECK-NEXT: moveq r6, #0 +; CHECK-NEXT: moveq r5, #0 ; CHECK-NEXT: b .LBB11_1 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.10: diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll @@ -7,10 +7,10 @@ ; CHECK: @ %bb.0: @ %vector.ph ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: movs r3, #249 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: adr r3, .LCPI0_0 -; CHECK-NEXT: mov.w lr, #249 ; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r0, q0, uxtw #2] @@ -57,12 +57,12 @@ ; CHECK: @ %bb.0: @ %vector.ph ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: adr r1, .LCPI1_0 +; CHECK-NEXT: movs r1, #249 ; CHECK-NEXT: adr r3, .LCPI1_1 +; CHECK-NEXT: dls lr, r1 +; CHECK-NEXT: adr r1, .LCPI1_0 ; CHECK-NEXT: vldrw.u32 q0, [r3] ; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: mov.w lr, #249 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q2, [r0, q0, uxtw #2] @@ -115,10 +115,10 @@ ; CHECK: @ %bb.0: @ %vector.ph ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: movs r3, #249 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: adr r3, .LCPI2_0 -; CHECK-NEXT: mov.w lr, #249 ; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner 
Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q1, [r0, q0, uxtw #1] @@ -170,12 +170,12 @@ ; CHECK: @ %bb.0: @ %vector.ph ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: movs r3, #249 ; CHECK-NEXT: adr.w r12, .LCPI3_0 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: adr r3, .LCPI3_1 ; CHECK-NEXT: vldrw.u32 q0, [r3] ; CHECK-NEXT: vldrw.u32 q1, [r12] -; CHECK-NEXT: mov.w lr, #249 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q2, [r0, q0, uxtw #1] @@ -237,10 +237,10 @@ ; CHECK: @ %bb.0: @ %vector.ph ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: movs r3, #249 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: adr r3, .LCPI4_0 -; CHECK-NEXT: mov.w lr, #249 ; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB4_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrb.u8 q1, [r0, q0] @@ -300,12 +300,12 @@ ; CHECK: @ %bb.0: @ %vector.ph ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: movs r3, #249 ; CHECK-NEXT: adr.w r12, .LCPI5_0 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: adr r3, .LCPI5_1 ; CHECK-NEXT: vldrw.u32 q0, [r3] ; CHECK-NEXT: vldrw.u32 q1, [r12] -; CHECK-NEXT: mov.w lr, #249 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB5_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrb.u8 q2, [r0, q0] @@ -383,10 +383,10 @@ ; CHECK: @ %bb.0: @ %vector.ph ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: movs r3, #249 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: adr r3, .LCPI6_0 -; CHECK-NEXT: mov.w lr, #249 ; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB6_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r0, q0, uxtw #2] @@ -433,12 +433,12 @@ ; CHECK: @ %bb.0: @ %vector.ph ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: adr r1, 
.LCPI7_0 +; CHECK-NEXT: movs r1, #249 ; CHECK-NEXT: adr r3, .LCPI7_1 +; CHECK-NEXT: dls lr, r1 +; CHECK-NEXT: adr r1, .LCPI7_0 ; CHECK-NEXT: vldrw.u32 q0, [r3] ; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: mov.w lr, #249 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB7_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q2, [r0, q0, uxtw #2] @@ -492,12 +492,12 @@ ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov s0, r2 -; CHECK-NEXT: adr r3, .LCPI8_0 +; CHECK-NEXT: movs r3, #249 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: mov.w lr, #249 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: vmov.f16 r2, s0 +; CHECK-NEXT: adr r3, .LCPI8_0 ; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q1, [r0, q0, uxtw #1] @@ -550,13 +550,13 @@ ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov s0, r2 -; CHECK-NEXT: adr r2, .LCPI9_0 +; CHECK-NEXT: movs r2, #249 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: mov.w lr, #249 +; CHECK-NEXT: dls lr, r2 ; CHECK-NEXT: vmov.f16 r1, s0 +; CHECK-NEXT: adr r2, .LCPI9_0 ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: adr r2, .LCPI9_1 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: .LBB9_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll @@ -7,11 +7,11 @@ ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: add.w r12, r0, r3, lsl #2 +; CHECK-NEXT: movw r0, #1250 +; CHECK-NEXT: dls lr, r0 ; CHECK-NEXT: adr r0, .LCPI0_0 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: movw lr, #1250 ; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: dls lr, lr 
; CHECK-NEXT: vadd.i32 q0, q0, r1 ; CHECK-NEXT: adds r1, r3, #4 ; CHECK-NEXT: .LBB0_1: @ %vector.body @@ -79,13 +79,13 @@ ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: add.w r4, r0, r3, lsl #2 +; CHECK-NEXT: movw r0, #1250 +; CHECK-NEXT: dls lr, r0 ; CHECK-NEXT: adr r0, .LCPI1_0 ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: add.w r12, r3, #4 -; CHECK-NEXT: movw lr, #1250 ; CHECK-NEXT: vmov.i32 q2, #0x0 ; CHECK-NEXT: vmov.i32 q0, #0x14 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r3 @@ -153,14 +153,14 @@ ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: add.w r12, r0, r3, lsl #2 +; CHECK-NEXT: movw r0, #1250 +; CHECK-NEXT: dls lr, r0 ; CHECK-NEXT: adr r0, .LCPI2_0 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: movw lr, #1250 ; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: vmov.i32 q2, #0x3 ; CHECK-NEXT: vadd.i32 q0, q0, r1 ; CHECK-NEXT: adds r1, r3, #4 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r3 @@ -239,9 +239,9 @@ ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: adr r6, .LCPI3_4 ; CHECK-NEXT: adr r5, .LCPI3_3 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 +; CHECK-NEXT: add.w r3, r3, r12, lsr #2 ; CHECK-NEXT: adr r4, .LCPI3_2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: vstrw.32 q0, [sp, #160] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r7] ; CHECK-NEXT: adr.w r8, .LCPI3_1 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-tailpred.ll b/llvm/test/CodeGen/Thumb2/mve-gather-tailpred.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-tailpred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-tailpred.ll @@ -20,9 +20,9 @@ ; CHECK-NEXT: .LBB0_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB0_3 Depth 2 +; CHECK-NEXT: dls lr, r4 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: dls lr, r4 ; 
CHECK-NEXT: .LBB0_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB0_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 diff --git a/llvm/test/CodeGen/Thumb2/mve-nounrolledremainder.ll b/llvm/test/CodeGen/Thumb2/mve-nounrolledremainder.ll --- a/llvm/test/CodeGen/Thumb2/mve-nounrolledremainder.ll +++ b/llvm/test/CodeGen/Thumb2/mve-nounrolledremainder.ll @@ -9,26 +9,25 @@ ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq .LBB0_6 ; CHECK-NEXT: @ %bb.1: @ %vector.memcheck -; CHECK-NEXT: add.w r4, r2, r3, lsl #1 -; CHECK-NEXT: add.w r5, r1, r3, lsl #1 -; CHECK-NEXT: cmp r4, r1 +; CHECK-NEXT: add.w r5, r2, r3, lsl #1 +; CHECK-NEXT: add.w r4, r1, r3, lsl #1 +; CHECK-NEXT: cmp r5, r1 ; CHECK-NEXT: cset r12, hi -; CHECK-NEXT: cmp r5, r2 +; CHECK-NEXT: cmp r4, r2 ; CHECK-NEXT: cset lr, hi -; CHECK-NEXT: cmp r4, r0 +; CHECK-NEXT: cmp r5, r0 ; CHECK-NEXT: add.w r5, r0, r3, lsl #1 ; CHECK-NEXT: cset r4, hi ; CHECK-NEXT: cmp r5, r2 ; CHECK-NEXT: cset r5, hi ; CHECK-NEXT: ands r4, r5 ; CHECK-NEXT: lsls r4, r4, #31 -; CHECK-NEXT: mov r4, r3 ; CHECK-NEXT: itt eq -; CHECK-NEXT: andeq.w r3, lr, r12 -; CHECK-NEXT: lslseq.w r3, r3, #31 +; CHECK-NEXT: andeq.w r5, lr, r12 +; CHECK-NEXT: lslseq.w r5, r5, #31 ; CHECK-NEXT: beq .LBB0_4 ; CHECK-NEXT: @ %bb.2: @ %while.body.preheader -; CHECK-NEXT: dls lr, r4 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: .LBB0_3: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldr.16 s0, [r0] @@ -41,29 +40,14 @@ ; CHECK-NEXT: le lr, .LBB0_3 ; CHECK-NEXT: b .LBB0_6 ; CHECK-NEXT: .LBB0_4: @ %vector.ph -; CHECK-NEXT: adds r3, r4, #7 -; CHECK-NEXT: movs r5, #1 -; CHECK-NEXT: bic r3, r3, #7 -; CHECK-NEXT: subs r3, #8 -; CHECK-NEXT: add.w r5, r5, r3, lsr #3 -; CHECK-NEXT: mov r3, r5 -; CHECK-NEXT: mov r5, r4 +; CHECK-NEXT: dlstp.16 lr, r3 ; CHECK-NEXT: .LBB0_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: mov lr, r3 -; CHECK-NEXT: vctp.16 r5 -; CHECK-NEXT: sub.w lr, lr, #1 -; CHECK-NEXT: subs r5, #8 -; 
CHECK-NEXT: vpstt -; CHECK-NEXT: vldrht.u16 q0, [r0], #16 -; CHECK-NEXT: vldrht.u16 q1, [r1], #16 -; CHECK-NEXT: mov r3, lr +; CHECK-NEXT: vldrh.u16 q0, [r0], #16 +; CHECK-NEXT: vldrh.u16 q1, [r1], #16 ; CHECK-NEXT: vadd.f16 q0, q1, q0 -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrht.16 q0, [r2], #16 -; CHECK-NEXT: cmp.w lr, #0 -; CHECK-NEXT: bne .LBB0_5 -; CHECK-NEXT: b .LBB0_6 +; CHECK-NEXT: vstrh.16 q0, [r2], #16 +; CHECK-NEXT: letp lr, .LBB0_5 ; CHECK-NEXT: .LBB0_6: @ %while.end ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: @@ -128,73 +112,68 @@ define void @notailpred(half* nocapture readonly %pSrcA, half* nocapture readonly %pSrcB, half* nocapture %pDst, i32 %blockSize) { ; CHECK-LABEL: notailpred: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r8, r9, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r8, r9, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: cbz r3, .LBB1_6 ; CHECK-NEXT: @ %bb.1: @ %while.body.preheader ; CHECK-NEXT: cmp r3, #8 ; CHECK-NEXT: blo .LBB1_3 ; CHECK-NEXT: @ %bb.2: @ %vector.memcheck -; CHECK-NEXT: add.w r4, r2, r3, lsl #1 -; CHECK-NEXT: add.w r5, r1, r3, lsl #1 -; CHECK-NEXT: cmp r4, r1 -; CHECK-NEXT: add.w r6, r0, r3, lsl #1 -; CHECK-NEXT: cset r12, hi -; CHECK-NEXT: cmp r5, r2 -; CHECK-NEXT: cset r5, hi -; CHECK-NEXT: cmp r4, r0 -; CHECK-NEXT: cset r4, hi +; CHECK-NEXT: add.w r5, r2, r3, lsl #1 +; CHECK-NEXT: add.w r6, r1, r3, lsl #1 +; CHECK-NEXT: cmp r5, r1 +; CHECK-NEXT: add.w r4, r0, r3, lsl #1 +; CHECK-NEXT: cset r7, hi ; CHECK-NEXT: cmp r6, r2 ; CHECK-NEXT: cset r6, hi -; CHECK-NEXT: ands r6, r4 -; CHECK-NEXT: lsls r6, r6, #31 +; CHECK-NEXT: cmp r5, r0 +; CHECK-NEXT: cset r5, hi +; CHECK-NEXT: cmp r4, r2 +; CHECK-NEXT: cset r4, hi +; CHECK-NEXT: ands r5, r4 +; CHECK-NEXT: lsls r5, r5, #31 ; CHECK-NEXT: itt eq -; CHECK-NEXT: andeq.w r6, r5, r12 -; CHECK-NEXT: lslseq.w r6, r6, #31 +; CHECK-NEXT: andeq r7, r6 +; CHECK-NEXT: lslseq.w r7, r7, #31 ; CHECK-NEXT: beq .LBB1_7 ; CHECK-NEXT: 
.LBB1_3: -; CHECK-NEXT: mov lr, r3 +; CHECK-NEXT: mov r5, r3 ; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: mov r4, r2 -; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: mov r7, r2 +; CHECK-NEXT: mov r4, r1 ; CHECK-NEXT: .LBB1_4: @ %while.body.preheader31 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dls lr, r5 ; CHECK-NEXT: .LBB1_5: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldr.16 s0, [r12] -; CHECK-NEXT: vldr.16 s2, [r5] -; CHECK-NEXT: adds r5, #2 +; CHECK-NEXT: vldr.16 s2, [r4] +; CHECK-NEXT: adds r4, #2 ; CHECK-NEXT: add.w r12, r12, #2 ; CHECK-NEXT: vadd.f16 s0, s2, s0 -; CHECK-NEXT: vstr.16 s0, [r4] -; CHECK-NEXT: adds r4, #2 +; CHECK-NEXT: vstr.16 s0, [r7] +; CHECK-NEXT: adds r7, #2 ; CHECK-NEXT: le lr, .LBB1_5 ; CHECK-NEXT: .LBB1_6: @ %while.end -; CHECK-NEXT: pop.w {r4, r5, r6, r8, r9, pc} +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} ; CHECK-NEXT: .LBB1_7: @ %vector.ph -; CHECK-NEXT: bic r8, r3, #7 -; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: sub.w r5, r8, #8 -; CHECK-NEXT: and r9, r3, #7 -; CHECK-NEXT: add.w r12, r0, r8, lsl #1 -; CHECK-NEXT: add.w r5, r4, r5, lsr #3 -; CHECK-NEXT: add.w r4, r2, r8, lsl #1 -; CHECK-NEXT: mov r6, r5 -; CHECK-NEXT: add.w r5, r1, r8, lsl #1 +; CHECK-NEXT: bic r6, r3, #7 +; CHECK-NEXT: movs r5, #1 +; CHECK-NEXT: sub.w r7, r6, #8 +; CHECK-NEXT: add.w r4, r1, r6, lsl #1 +; CHECK-NEXT: add.w r12, r0, r6, lsl #1 +; CHECK-NEXT: add.w r5, r5, r7, lsr #3 +; CHECK-NEXT: add.w r7, r2, r6, lsl #1 +; CHECK-NEXT: dls lr, r5 +; CHECK-NEXT: and r5, r3, #7 ; CHECK-NEXT: .LBB1_8: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q0, [r0], #16 ; CHECK-NEXT: vldrh.u16 q1, [r1], #16 -; CHECK-NEXT: mov lr, r6 ; CHECK-NEXT: vadd.f16 q0, q1, q0 -; CHECK-NEXT: subs.w lr, lr, #1 ; CHECK-NEXT: vstrb.8 q0, [r2], #16 -; CHECK-NEXT: mov r6, lr -; CHECK-NEXT: bne .LBB1_8 -; CHECK-NEXT: b .LBB1_9 -; CHECK-NEXT: .LBB1_9: @ %middle.block -; CHECK-NEXT: cmp r8, r3 -; CHECK-NEXT: mov lr, r9 +; CHECK-NEXT: le lr, 
.LBB1_8 +; CHECK-NEXT: @ %bb.9: @ %middle.block +; CHECK-NEXT: cmp r6, r3 ; CHECK-NEXT: bne .LBB1_4 ; CHECK-NEXT: b .LBB1_6 entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll --- a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll +++ b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll @@ -13,36 +13,43 @@ ; CHECK-NEXT: cmp.w r12, #2 ; CHECK-NEXT: blo .LBB0_5 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: ldr.w r9, [r0, #8] +; CHECK-NEXT: ldr r5, [r0, #8] ; CHECK-NEXT: ldr r3, [r0] -; CHECK-NEXT: add.w r3, r3, r9, lsl #2 +; CHECK-NEXT: adds r0, r5, #3 +; CHECK-NEXT: bic r0, r0, #3 +; CHECK-NEXT: add.w r4, r3, r5, lsl #2 +; CHECK-NEXT: subs r3, r0, #4 ; CHECK-NEXT: movs r0, #1 -; CHECK-NEXT: lsl.w r8, r9, #2 +; CHECK-NEXT: lsl.w r9, r5, #2 +; CHECK-NEXT: add.w r8, r0, r3, lsr #2 ; CHECK-NEXT: .LBB0_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB0_3 Depth 2 +; CHECK-NEXT: dls lr, r8 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: mov r6, r1 -; CHECK-NEXT: mov r7, r3 -; CHECK-NEXT: mov r5, r9 -; CHECK-NEXT: dlstp.32 lr, r5 +; CHECK-NEXT: mov r7, r1 +; CHECK-NEXT: mov r3, r4 +; CHECK-NEXT: mov r6, r5 ; CHECK-NEXT: .LBB0_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB0_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vldrw.u32 q1, [r6], #16 -; CHECK-NEXT: vldrw.u32 q2, [r7], #16 -; CHECK-NEXT: vfma.f32 q0, q2, q1 -; CHECK-NEXT: letp lr, .LBB0_3 +; CHECK-NEXT: vctp.32 r6 +; CHECK-NEXT: subs r6, #4 +; CHECK-NEXT: vpsttt +; CHECK-NEXT: vldrwt.u32 q1, [r7], #16 +; CHECK-NEXT: vldrwt.u32 q2, [r3], #16 +; CHECK-NEXT: vfmat.f32 q0, q2, q1 +; CHECK-NEXT: le lr, .LBB0_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: vadd.f32 s4, s2, s3 -; CHECK-NEXT: add.w r7, r2, r0, lsl #2 +; CHECK-NEXT: add.w r3, r2, r0, lsl #2 ; CHECK-NEXT: vadd.f32 s0, s0, s1 ; CHECK-NEXT: adds r0, #1 -; CHECK-NEXT: add r3, r8 +; 
CHECK-NEXT: add r4, r9 ; CHECK-NEXT: cmp r0, r12 ; CHECK-NEXT: vadd.f32 s0, s0, s4 -; CHECK-NEXT: vstr s0, [r7] +; CHECK-NEXT: vstr s0, [r3] ; CHECK-NEXT: bne .LBB0_2 ; CHECK-NEXT: .LBB0_5: @ %for.cond.cleanup ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} @@ -115,37 +122,45 @@ ; CHECK-NEXT: ldr.w r12, [r0, #8] ; CHECK-NEXT: movs r4, #1 ; CHECK-NEXT: ldr r3, [r0] -; CHECK-NEXT: add.w r5, r3, r12, lsl #2 +; CHECK-NEXT: add.w r0, r12, #3 +; CHECK-NEXT: bic r0, r0, #3 +; CHECK-NEXT: add.w r6, r3, r12, lsl #2 +; CHECK-NEXT: subs r0, #4 ; CHECK-NEXT: add.w r7, r3, r12, lsl #3 -; CHECK-NEXT: lsl.w r8, r12, #3 +; CHECK-NEXT: lsl.w r10, r12, #3 +; CHECK-NEXT: add.w r8, r4, r0, lsr #2 ; CHECK-NEXT: .LBB1_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB1_3 Depth 2 +; CHECK-NEXT: dls lr, r8 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: ldr r6, [sp] @ 4-byte Reload -; CHECK-NEXT: add.w r10, r4, #1 -; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: ldr r5, [sp] @ 4-byte Reload +; CHECK-NEXT: add.w r11, r4, #1 +; CHECK-NEXT: mov r3, r6 ; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: dlstp.32 lr, r12 -; CHECK-NEXT: mov r11, r12 +; CHECK-NEXT: mov r9, r12 ; CHECK-NEXT: .LBB1_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB1_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vldrw.u32 q2, [r6], #16 -; CHECK-NEXT: vldrw.u32 q3, [r3], #16 -; CHECK-NEXT: vfma.f32 q1, q3, q2 -; CHECK-NEXT: vldrw.u32 q3, [r0], #16 -; CHECK-NEXT: vfma.f32 q0, q3, q2 -; CHECK-NEXT: letp lr, .LBB1_3 +; CHECK-NEXT: vctp.32 r9 +; CHECK-NEXT: sub.w r9, r9, #4 +; CHECK-NEXT: vpstttt +; CHECK-NEXT: vldrwt.u32 q2, [r5], #16 +; CHECK-NEXT: vldrwt.u32 q3, [r3], #16 +; CHECK-NEXT: vfmat.f32 q1, q3, q2 +; CHECK-NEXT: vldrwt.u32 q3, [r0], #16 +; CHECK-NEXT: vpst +; CHECK-NEXT: vfmat.f32 q0, q3, q2 +; CHECK-NEXT: le lr, .LBB1_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB1_2 Depth=1 ; CHECK-NEXT: vadd.f32 s8, s2, 
s3 -; CHECK-NEXT: add.w r0, r2, r10, lsl #2 +; CHECK-NEXT: add.w r0, r2, r11, lsl #2 ; CHECK-NEXT: vadd.f32 s0, s0, s1 -; CHECK-NEXT: add r5, r8 +; CHECK-NEXT: add r6, r10 ; CHECK-NEXT: vadd.f32 s2, s6, s7 -; CHECK-NEXT: add r7, r8 +; CHECK-NEXT: add r7, r10 ; CHECK-NEXT: vadd.f32 s4, s4, s5 ; CHECK-NEXT: vadd.f32 s0, s0, s8 ; CHECK-NEXT: vadd.f32 s2, s4, s2 @@ -242,40 +257,40 @@ ; CHECK-NEXT: cmp r1, #2 ; CHECK-NEXT: blo .LBB2_5 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: ldr r3, [r0, #8] +; CHECK-NEXT: ldr r7, [r0, #8] ; CHECK-NEXT: movs r5, #1 -; CHECK-NEXT: ldr r1, [r0] -; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: add.w r0, r3, r3, lsl #1 -; CHECK-NEXT: add.w r7, r1, r3, lsl #2 -; CHECK-NEXT: add.w r12, r1, r3, lsl #3 -; CHECK-NEXT: adds r3, #3 +; CHECK-NEXT: ldr r3, [r0] +; CHECK-NEXT: str r7, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: add.w r0, r7, r7, lsl #1 +; CHECK-NEXT: add.w r12, r3, r7, lsl #2 +; CHECK-NEXT: add.w r1, r3, r7, lsl #3 +; CHECK-NEXT: add.w r8, r3, r0, lsl #2 +; CHECK-NEXT: adds r3, r7, #3 ; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: add.w r1, r1, r0, lsl #2 +; CHECK-NEXT: lsls r7, r0, #2 ; CHECK-NEXT: subs r3, #4 -; CHECK-NEXT: lsl.w r11, r0, #2 ; CHECK-NEXT: add.w r3, r5, r3, lsr #2 ; CHECK-NEXT: str r3, [sp] @ 4-byte Spill ; CHECK-NEXT: .LBB2_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB2_3 Depth 2 -; CHECK-NEXT: ldr.w lr, [sp] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: add.w r9, r5, #2 -; CHECK-NEXT: add.w r10, r5, #1 -; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: add.w r11, r5, #1 +; CHECK-NEXT: dls lr, r0 +; CHECK-NEXT: mov r3, r12 ; CHECK-NEXT: ldr r6, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: mov r0, r12 -; CHECK-NEXT: ldr.w r8, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: ldr.w r10, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: mov r4, r8 ; 
CHECK-NEXT: vmov q2, q0 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: .LBB2_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB2_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vctp.32 r8 -; CHECK-NEXT: sub.w r8, r8, #4 +; CHECK-NEXT: vctp.32 r10 +; CHECK-NEXT: sub.w r10, r10, #4 ; CHECK-NEXT: vpstttt ; CHECK-NEXT: vldrwt.u32 q3, [r6], #16 ; CHECK-NEXT: vldrwt.u32 q4, [r3], #16 @@ -289,13 +304,13 @@ ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB2_2 Depth=1 ; CHECK-NEXT: vadd.f32 s12, s10, s11 -; CHECK-NEXT: add.w r0, r2, r10, lsl #2 +; CHECK-NEXT: add.w r0, r2, r11, lsl #2 ; CHECK-NEXT: vadd.f32 s8, s8, s9 -; CHECK-NEXT: add r7, r11 +; CHECK-NEXT: add r12, r7 ; CHECK-NEXT: vadd.f32 s10, s6, s7 -; CHECK-NEXT: add r12, r11 +; CHECK-NEXT: add r1, r7 ; CHECK-NEXT: vadd.f32 s4, s4, s5 -; CHECK-NEXT: add r1, r11 +; CHECK-NEXT: add r8, r7 ; CHECK-NEXT: vadd.f32 s6, s2, s3 ; CHECK-NEXT: vadd.f32 s0, s0, s1 ; CHECK-NEXT: vadd.f32 s2, s8, s12 @@ -416,10 +431,10 @@ ; CHECK-NEXT: movs r6, #1 ; CHECK-NEXT: ldr r1, [r0] ; CHECK-NEXT: add.w r0, r3, r3, lsl #1 -; CHECK-NEXT: add.w r12, r1, r3, lsl #2 -; CHECK-NEXT: add.w r10, r1, r3, lsl #3 -; CHECK-NEXT: add.w r9, r1, r3, lsl #4 -; CHECK-NEXT: add.w r8, r1, r0, lsl #2 +; CHECK-NEXT: add.w r8, r1, r3, lsl #2 +; CHECK-NEXT: add.w r12, r1, r3, lsl #3 +; CHECK-NEXT: add.w r10, r1, r3, lsl #4 +; CHECK-NEXT: add.w r9, r1, r0, lsl #2 ; CHECK-NEXT: adds r0, r3, #3 ; CHECK-NEXT: bic r0, r0, #3 ; CHECK-NEXT: lsls r7, r3, #4 @@ -429,22 +444,22 @@ ; CHECK-NEXT: .LBB3_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB3_3 Depth 2 -; CHECK-NEXT: ldr.w lr, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: adds r0, r6, #3 +; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: mov r3, r12 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: mov r5, r9 +; CHECK-NEXT: dls lr, r0 +; CHECK-NEXT: adds r0, r6, #3 ; CHECK-NEXT: 
str r0, [sp, #28] @ 4-byte Spill ; CHECK-NEXT: adds r0, r6, #2 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: mov r4, r10 ; CHECK-NEXT: ldr.w r11, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: mov r5, r8 +; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: str r0, [sp, #24] @ 4-byte Spill ; CHECK-NEXT: adds r0, r6, #1 ; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill -; CHECK-NEXT: mov r0, r10 -; CHECK-NEXT: mov r4, r9 -; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: mov r0, r12 ; CHECK-NEXT: vmov q2, q0 ; CHECK-NEXT: vmov q3, q0 ; CHECK-NEXT: .LBB3_3: @ %vector.body @@ -470,15 +485,15 @@ ; CHECK-NEXT: vadd.f32 s16, s14, s15 ; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: vadd.f32 s12, s12, s13 -; CHECK-NEXT: add r12, r7 +; CHECK-NEXT: add r8, r7 ; CHECK-NEXT: vadd.f32 s14, s10, s11 -; CHECK-NEXT: add r10, r7 +; CHECK-NEXT: add r12, r7 ; CHECK-NEXT: vadd.f32 s8, s8, s9 ; CHECK-NEXT: add.w r0, r2, r0, lsl #2 ; CHECK-NEXT: vadd.f32 s10, s6, s7 -; CHECK-NEXT: add r8, r7 -; CHECK-NEXT: vadd.f32 s4, s4, s5 ; CHECK-NEXT: add r9, r7 +; CHECK-NEXT: vadd.f32 s4, s4, s5 +; CHECK-NEXT: add r10, r7 ; CHECK-NEXT: vadd.f32 s6, s2, s3 ; CHECK-NEXT: vadd.f32 s0, s0, s1 ; CHECK-NEXT: vadd.f32 s2, s12, s16 @@ -618,7 +633,7 @@ ; CHECK-NEXT: adds r0, r3, #3 ; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: bic r0, r0, #3 -; CHECK-NEXT: add.w r12, r1, r3, lsl #2 +; CHECK-NEXT: add.w r8, r1, r3, lsl #2 ; CHECK-NEXT: subs r1, r0, #4 ; CHECK-NEXT: movs r0, #1 ; CHECK-NEXT: lsls r5, r3, #2 @@ -630,51 +645,51 @@ ; CHECK-NEXT: .LBB4_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB4_3 Depth 2 -; CHECK-NEXT: ldr.w lr, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: adds r1, r0, #4 +; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: add.w r10, r0, #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: adds r7, r0, #1 +; CHECK-NEXT: dls lr, r1 +; CHECK-NEXT: adds r1, r0, #4 ; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill 
; CHECK-NEXT: adds r1, r0, #3 ; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill -; CHECK-NEXT: ldr r6, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: add.w r11, r0, #1 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: mov r3, r12 +; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: ldr.w r11, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: vmov q3, q1 ; CHECK-NEXT: vmov q2, q1 ; CHECK-NEXT: vmov q4, q1 ; CHECK-NEXT: .LBB4_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB4_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: add.w r8, r3, r5 -; CHECK-NEXT: vctp.32 r1 +; CHECK-NEXT: add.w r9, r3, r5 +; CHECK-NEXT: vctp.32 r11 ; CHECK-NEXT: vpsttt -; CHECK-NEXT: vldrwt.u32 q5, [r6], #16 +; CHECK-NEXT: vldrwt.u32 q5, [r1], #16 ; CHECK-NEXT: vldrwt.u32 q6, [r3], #16 ; CHECK-NEXT: vfmat.f32 q3, q6, q5 -; CHECK-NEXT: add.w r9, r8, r5 +; CHECK-NEXT: add.w r12, r9, r5 ; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrwt.u32 q6, [r8] +; CHECK-NEXT: vldrwt.u32 q6, [r9] ; CHECK-NEXT: vfmat.f32 q4, q6, q5 -; CHECK-NEXT: subs r1, #4 -; CHECK-NEXT: add.w r4, r9, r5 +; CHECK-NEXT: sub.w r11, r11, #4 +; CHECK-NEXT: add.w r4, r12, r5 ; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrwt.u32 q6, [r9] +; CHECK-NEXT: vldrwt.u32 q6, [r12] ; CHECK-NEXT: vfmat.f32 q2, q6, q5 -; CHECK-NEXT: adds r7, r4, r5 +; CHECK-NEXT: adds r6, r4, r5 ; CHECK-NEXT: vpstttt ; CHECK-NEXT: vldrwt.u32 q6, [r4] ; CHECK-NEXT: vfmat.f32 q0, q6, q5 -; CHECK-NEXT: vldrwt.u32 q6, [r7] +; CHECK-NEXT: vldrwt.u32 q6, [r6] ; CHECK-NEXT: vfmat.f32 q1, q6, q5 ; CHECK-NEXT: le lr, .LBB4_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB4_2 Depth=1 ; CHECK-NEXT: vadd.f32 s20, s18, s19 -; CHECK-NEXT: add.w r1, r2, r11, lsl #2 +; CHECK-NEXT: add.w r1, r2, r7, lsl #2 ; CHECK-NEXT: vadd.f32 s16, s16, s17 ; CHECK-NEXT: vadd.f32 s18, s14, s15 ; CHECK-NEXT: vadd.f32 s12, s12, s13 @@ -702,7 +717,7 @@ ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; 
CHECK-NEXT: vstr s4, [r1] ; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: add r12, r1 +; CHECK-NEXT: add r8, r1 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: cmp r0, r1 ; CHECK-NEXT: blo.w .LBB4_2 @@ -839,7 +854,7 @@ ; CHECK-NEXT: adds r0, r3, #3 ; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: bic r0, r0, #3 -; CHECK-NEXT: add.w r8, r1, r3, lsl #2 +; CHECK-NEXT: add.w r9, r1, r3, lsl #2 ; CHECK-NEXT: subs r1, r0, #4 ; CHECK-NEXT: movs r0, #1 ; CHECK-NEXT: lsls r5, r3, #2 @@ -851,21 +866,21 @@ ; CHECK-NEXT: .LBB5_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB5_3 Depth 2 -; CHECK-NEXT: ldr.w lr, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: adds r1, r0, #5 +; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: add.w r11, r0, #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: adds r4, r0, #1 +; CHECK-NEXT: dls lr, r1 +; CHECK-NEXT: adds r1, r0, #5 ; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill ; CHECK-NEXT: adds r1, r0, #4 ; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill ; CHECK-NEXT: adds r1, r0, #3 ; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: mov r3, r9 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: adds r6, r0, #1 -; CHECK-NEXT: ldr.w r12, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: mov r3, r8 ; CHECK-NEXT: vmov q3, q1 +; CHECK-NEXT: ldr.w r8, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: vmov q4, q1 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vmov q5, q1 @@ -873,36 +888,36 @@ ; CHECK-NEXT: .LBB5_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB5_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: add.w r9, r3, r5 -; CHECK-NEXT: vctp.32 r12 +; CHECK-NEXT: add.w r12, r3, r5 +; CHECK-NEXT: vctp.32 r8 ; CHECK-NEXT: vpsttt ; CHECK-NEXT: vldrwt.u32 q6, [r1], #16 ; CHECK-NEXT: vldrwt.u32 q7, [r3], #16 ; CHECK-NEXT: vfmat.f32 q4, q7, q6 -; CHECK-NEXT: add.w r10, r9, r5 +; CHECK-NEXT: add.w r10, r12, r5 ; CHECK-NEXT: vpstt -; 
CHECK-NEXT: vldrwt.u32 q7, [r9] +; CHECK-NEXT: vldrwt.u32 q7, [r12] ; CHECK-NEXT: vfmat.f32 q5, q7, q6 -; CHECK-NEXT: add.w r4, r10, r5 +; CHECK-NEXT: add.w r6, r10, r5 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vldrwt.u32 q7, [r10] ; CHECK-NEXT: vfmat.f32 q2, q7, q6 -; CHECK-NEXT: sub.w r12, r12, #4 -; CHECK-NEXT: adds r7, r4, r5 +; CHECK-NEXT: sub.w r8, r8, #4 +; CHECK-NEXT: adds r7, r6, r5 ; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrwt.u32 q7, [r4] +; CHECK-NEXT: vldrwt.u32 q7, [r6] ; CHECK-NEXT: vfmat.f32 q0, q7, q6 -; CHECK-NEXT: adds r4, r7, r5 +; CHECK-NEXT: adds r6, r7, r5 ; CHECK-NEXT: vpstttt ; CHECK-NEXT: vldrwt.u32 q7, [r7] ; CHECK-NEXT: vfmat.f32 q3, q7, q6 -; CHECK-NEXT: vldrwt.u32 q7, [r4] +; CHECK-NEXT: vldrwt.u32 q7, [r6] ; CHECK-NEXT: vfmat.f32 q1, q7, q6 ; CHECK-NEXT: le lr, .LBB5_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB5_2 Depth=1 ; CHECK-NEXT: vadd.f32 s24, s22, s23 -; CHECK-NEXT: add.w r1, r2, r6, lsl #2 +; CHECK-NEXT: add.w r1, r2, r4, lsl #2 ; CHECK-NEXT: vadd.f32 s20, s20, s21 ; CHECK-NEXT: vadd.f32 s22, s18, s19 ; CHECK-NEXT: vadd.f32 s16, s16, s17 @@ -936,7 +951,7 @@ ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s4, [r1] ; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload -; CHECK-NEXT: add r8, r1 +; CHECK-NEXT: add r9, r1 ; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: cmp r0, r1 ; CHECK-NEXT: blo.w .LBB5_2 @@ -1098,11 +1113,12 @@ ; CHECK-NEXT: .LBB6_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB6_3 Depth 2 -; CHECK-NEXT: ldr.w lr, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: adds r1, r0, #6 +; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: vmov.i32 q2, #0x0 -; CHECK-NEXT: adds r6, r0, #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: adds r4, r0, #2 +; CHECK-NEXT: add.w r8, r0, #1 +; CHECK-NEXT: dls lr, r1 +; CHECK-NEXT: adds r1, r0, #6 ; CHECK-NEXT: str r1, [sp, #44] @ 4-byte Spill ; CHECK-NEXT: adds r1, r0, #5 ; CHECK-NEXT: str r1, [sp, #40] @ 4-byte 
Spill @@ -1110,11 +1126,10 @@ ; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill ; CHECK-NEXT: adds r1, r0, #3 ; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill -; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: add.w r8, r0, #1 -; CHECK-NEXT: ldr.w r9, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: mov r3, r12 +; CHECK-NEXT: ldr.w r9, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: vmov q4, q2 +; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: vmov q5, q2 ; CHECK-NEXT: vmov q3, q2 ; CHECK-NEXT: vmov q6, q2 @@ -1124,9 +1139,9 @@ ; CHECK-NEXT: @ Parent Loop BB6_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: add.w r10, r3, r5 -; CHECK-NEXT: vctp.32 r9 +; CHECK-NEXT: vctp.32 r1 ; CHECK-NEXT: vpsttt -; CHECK-NEXT: vldrwt.u32 q7, [r1], #16 +; CHECK-NEXT: vldrwt.u32 q7, [r9], #16 ; CHECK-NEXT: vldrwt.u32 q0, [r3], #16 ; CHECK-NEXT: vfmat.f32 q5, q0, q7 ; CHECK-NEXT: add.w r11, r10, r5 @@ -1137,16 +1152,16 @@ ; CHECK-NEXT: vpstt ; CHECK-NEXT: vldrwt.u32 q0, [r11] ; CHECK-NEXT: vfmat.f32 q1, q0, q7 -; CHECK-NEXT: add.w r4, r11, r5 +; CHECK-NEXT: add.w r6, r11, r5 ; CHECK-NEXT: vmov q6, q5 ; CHECK-NEXT: vmov q5, q4 ; CHECK-NEXT: vmov q4, q2 ; CHECK-NEXT: vmov q2, q3 ; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q0, [r4] +; CHECK-NEXT: vldrwt.u32 q0, [r6] ; CHECK-NEXT: vmov q3, q1 ; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: adds r7, r4, r5 +; CHECK-NEXT: adds r7, r6, r5 ; CHECK-NEXT: vpst ; CHECK-NEXT: vfmat.f32 q1, q0, q7 ; CHECK-NEXT: vstrw.32 q1, [sp, #64] @ 16-byte Spill @@ -1156,14 +1171,14 @@ ; CHECK-NEXT: vmov q4, q5 ; CHECK-NEXT: vmov q5, q6 ; CHECK-NEXT: vldrw.u32 q6, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: sub.w r9, r9, #4 -; CHECK-NEXT: adds r4, r7, r5 +; CHECK-NEXT: subs r1, #4 +; CHECK-NEXT: adds r6, r7, r5 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vldrwt.u32 q0, [r7] ; CHECK-NEXT: vfmat.f32 q3, q0, q7 -; CHECK-NEXT: adds r7, r4, r5 +; CHECK-NEXT: adds r7, r6, r5 ; CHECK-NEXT: vpstttt -; CHECK-NEXT: vldrwt.u32 
q0, [r4] +; CHECK-NEXT: vldrwt.u32 q0, [r6] ; CHECK-NEXT: vfmat.f32 q4, q0, q7 ; CHECK-NEXT: vldrwt.u32 q0, [r7] ; CHECK-NEXT: vfmat.f32 q2, q0, q7 @@ -1195,7 +1210,7 @@ ; CHECK-NEXT: adds r0, #7 ; CHECK-NEXT: vadd.f32 s10, s9, s10 ; CHECK-NEXT: vstr s2, [r1] -; CHECK-NEXT: add.w r1, r2, r6, lsl #2 +; CHECK-NEXT: add.w r1, r2, r4, lsl #2 ; CHECK-NEXT: vadd.f32 s8, s8, s20 ; CHECK-NEXT: vadd.f32 s6, s5, s6 ; CHECK-NEXT: vstr s4, [r1] @@ -1386,11 +1401,12 @@ ; CHECK-NEXT: .LBB7_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB7_3 Depth 2 -; CHECK-NEXT: ldr.w lr, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: vmov.i32 q3, #0x0 +; CHECK-NEXT: adds r4, r0, #3 +; CHECK-NEXT: add.w r8, r0, #2 +; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: adds r1, r0, #7 -; CHECK-NEXT: vmov.i32 q2, #0x0 -; CHECK-NEXT: adds r6, r0, #3 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: str r1, [sp, #44] @ 4-byte Spill ; CHECK-NEXT: adds r1, r0, #6 ; CHECK-NEXT: str r1, [sp, #40] @ 4-byte Spill @@ -1398,18 +1414,17 @@ ; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill ; CHECK-NEXT: adds r1, r0, #4 ; CHECK-NEXT: ldr.w r12, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: mov r3, r9 ; CHECK-NEXT: ldr.w r10, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: add.w r8, r0, #2 +; CHECK-NEXT: vmov q5, q3 ; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill ; CHECK-NEXT: adds r1, r0, #1 -; CHECK-NEXT: mov r3, r9 -; CHECK-NEXT: vmov q5, q2 -; CHECK-NEXT: vmov q6, q2 -; CHECK-NEXT: vmov q4, q2 -; CHECK-NEXT: vmov q7, q2 -; CHECK-NEXT: vmov q3, q2 -; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q2, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: vmov q6, q3 +; CHECK-NEXT: vmov q4, q3 +; CHECK-NEXT: vmov q7, q3 +; CHECK-NEXT: vmov q2, q3 +; CHECK-NEXT: vstrw.32 q3, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q3, [sp, #80] @ 16-byte Spill ; CHECK-NEXT: .LBB7_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB7_2 Depth=1 ; CHECK-NEXT: @ => 
This Inner Loop Header: Depth=2 @@ -1419,95 +1434,95 @@ ; CHECK-NEXT: vldrwt.u32 q0, [r12], #16 ; CHECK-NEXT: vldrwt.u32 q1, [r3], #16 ; CHECK-NEXT: vfmat.f32 q6, q1, q0 -; CHECK-NEXT: add.w r4, r11, r5 +; CHECK-NEXT: add.w r6, r11, r5 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vldrwt.u32 q1, [r11] ; CHECK-NEXT: vfmat.f32 q7, q1, q0 ; CHECK-NEXT: vstrw.32 q7, [sp, #48] @ 16-byte Spill ; CHECK-NEXT: vmov q7, q6 ; CHECK-NEXT: vmov q6, q5 -; CHECK-NEXT: vmov q5, q4 -; CHECK-NEXT: vmov q4, q2 +; CHECK-NEXT: vmov q5, q3 +; CHECK-NEXT: vmov q3, q4 ; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q1, [r4] -; CHECK-NEXT: vmov q2, q3 -; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: adds r7, r4, r5 +; CHECK-NEXT: vldrwt.u32 q1, [r6] +; CHECK-NEXT: vmov q4, q2 +; CHECK-NEXT: vldrw.u32 q2, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: adds r7, r6, r5 ; CHECK-NEXT: vpst -; CHECK-NEXT: vfmat.f32 q3, q1, q0 -; CHECK-NEXT: vstrw.32 q3, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vfmat.f32 q2, q1, q0 +; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrwt.u32 q1, [r7] -; CHECK-NEXT: vldrw.u32 q3, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload ; CHECK-NEXT: vpst -; CHECK-NEXT: vfmat.f32 q3, q1, q0 -; CHECK-NEXT: adds r4, r7, r5 -; CHECK-NEXT: vstrw.32 q3, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vfmat.f32 q2, q1, q0 +; CHECK-NEXT: adds r6, r7, r5 +; CHECK-NEXT: vstrw.32 q2, [sp, #80] @ 16-byte Spill ; CHECK-NEXT: vmov q2, q4 -; CHECK-NEXT: vmov q4, q5 +; CHECK-NEXT: vmov q4, q3 +; CHECK-NEXT: vmov q3, q5 ; CHECK-NEXT: vmov q5, q6 ; CHECK-NEXT: vmov q6, q7 ; CHECK-NEXT: vldrw.u32 q7, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: adds r7, r4, r5 +; CHECK-NEXT: adds r7, r6, r5 ; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrwt.u32 q1, [r4] -; CHECK-NEXT: vfmat.f32 q3, q1, q0 +; CHECK-NEXT: vldrwt.u32 q1, [r6] +; CHECK-NEXT: vfmat.f32 q2, q1, q0 ; CHECK-NEXT: sub.w r10, r10, #4 -; CHECK-NEXT: adds 
r4, r7, r5 +; CHECK-NEXT: adds r6, r7, r5 ; CHECK-NEXT: vpstttt ; CHECK-NEXT: vldrwt.u32 q1, [r7] ; CHECK-NEXT: vfmat.f32 q4, q1, q0 -; CHECK-NEXT: vldrwt.u32 q1, [r4] +; CHECK-NEXT: vldrwt.u32 q1, [r6] ; CHECK-NEXT: vfmat.f32 q5, q1, q0 -; CHECK-NEXT: add r4, r5 +; CHECK-NEXT: add r6, r5 ; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrwt.u32 q1, [r4] -; CHECK-NEXT: vfmat.f32 q2, q1, q0 +; CHECK-NEXT: vldrwt.u32 q1, [r6] +; CHECK-NEXT: vfmat.f32 q3, q1, q0 ; CHECK-NEXT: le lr, .LBB7_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB7_2 Depth=1 ; CHECK-NEXT: vadd.f32 s0, s30, s31 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vadd.f32 s2, s28, s29 -; CHECK-NEXT: vadd.f32 s8, s8, s9 -; CHECK-NEXT: vadd.f32 s5, s10, s11 +; CHECK-NEXT: vadd.f32 s12, s12, s13 +; CHECK-NEXT: vadd.f32 s5, s14, s15 ; CHECK-NEXT: vadd.f32 s4, s26, s27 ; CHECK-NEXT: vadd.f32 s6, s24, s25 -; CHECK-NEXT: vadd.f32 s10, s18, s19 +; CHECK-NEXT: vadd.f32 s14, s18, s19 ; CHECK-NEXT: vadd.f32 s7, s16, s17 ; CHECK-NEXT: vldrw.u32 q4, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vadd.f32 s9, s14, s15 -; CHECK-NEXT: vadd.f32 s12, s12, s13 -; CHECK-NEXT: vadd.f32 s14, s18, s19 -; CHECK-NEXT: vadd.f32 s11, s16, s17 +; CHECK-NEXT: vadd.f32 s8, s8, s9 +; CHECK-NEXT: vadd.f32 s13, s10, s11 +; CHECK-NEXT: vadd.f32 s10, s18, s19 +; CHECK-NEXT: vadd.f32 s9, s16, s17 ; CHECK-NEXT: vldrw.u32 q4, [sp, #80] @ 16-byte Reload ; CHECK-NEXT: vadd.f32 s0, s2, s0 -; CHECK-NEXT: vadd.f32 s13, s18, s19 +; CHECK-NEXT: vadd.f32 s11, s18, s19 ; CHECK-NEXT: vadd.f32 s15, s16, s17 ; CHECK-NEXT: vadd.f32 s2, s6, s4 -; CHECK-NEXT: vadd.f32 s6, s8, s5 -; CHECK-NEXT: vadd.f32 s8, s7, s10 -; CHECK-NEXT: vadd.f32 s10, s12, s9 -; CHECK-NEXT: vadd.f32 s12, s11, s14 +; CHECK-NEXT: vadd.f32 s6, s12, s5 +; CHECK-NEXT: vadd.f32 s12, s7, s14 +; CHECK-NEXT: vadd.f32 s10, s9, s10 ; CHECK-NEXT: vstr s0, [r1] ; CHECK-NEXT: add.w r1, r2, r0, lsl #2 -; CHECK-NEXT: vadd.f32 s1, s22, s23 -; CHECK-NEXT: vadd.f32 s14, s15, s13 +; 
CHECK-NEXT: vadd.f32 s8, s8, s13 ; CHECK-NEXT: adds r0, #8 +; CHECK-NEXT: vadd.f32 s14, s15, s11 ; CHECK-NEXT: vstr s2, [r1] ; CHECK-NEXT: add.w r1, r2, r8, lsl #2 +; CHECK-NEXT: vadd.f32 s1, s22, s23 ; CHECK-NEXT: vadd.f32 s3, s20, s21 -; CHECK-NEXT: vstr s12, [r1] -; CHECK-NEXT: add.w r1, r2, r6, lsl #2 +; CHECK-NEXT: vstr s10, [r1] +; CHECK-NEXT: add.w r1, r2, r4, lsl #2 ; CHECK-NEXT: vstr s14, [r1] ; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload ; CHECK-NEXT: vadd.f32 s4, s3, s1 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 -; CHECK-NEXT: vstr s10, [r1] +; CHECK-NEXT: vstr s8, [r1] ; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 -; CHECK-NEXT: vstr s8, [r1] +; CHECK-NEXT: vstr s12, [r1] ; CHECK-NEXT: ldr r1, [sp, #40] @ 4-byte Reload ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s4, [r1] diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll --- a/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll +++ b/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll @@ -8,18 +8,17 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: mov lr, r1 ; CHECK-NEXT: cmp r1, #1 ; CHECK-NEXT: blt .LBB0_4 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: mov r1, r0 +; CHECK-NEXT: dls lr, r1 +; CHECK-NEXT: mov r2, r0 ; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB0_2: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r1], #32 +; CHECK-NEXT: vldrw.u32 q0, [r2], #32 ; CHECK-NEXT: vaddva.s32 r0, q0 -; CHECK-NEXT: vldrw.u32 q0, [r1, #-16] +; CHECK-NEXT: vldrw.u32 q0, [r2, #-16] ; CHECK-NEXT: vaddva.s32 r0, q0 ; CHECK-NEXT: le lr, .LBB0_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup @@ -283,10 +282,10 @@ ; CHECK-NEXT: movs r5, #1 ; CHECK-NEXT: sub.w r6, r12, #8 ; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: add.w lr, r5, r6, lsr #3 +; CHECK-NEXT: add.w r6, r5, r6, lsr #3 ; 
CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: dls lr, r6 ; CHECK-NEXT: mov r6, r2 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB2_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r4, #16] @@ -305,11 +304,11 @@ ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, r5, r6, pc} ; CHECK-NEXT: .LBB2_6: @ %for.body.preheader12 -; CHECK-NEXT: sub.w lr, r3, r12 +; CHECK-NEXT: sub.w r3, r3, r12 ; CHECK-NEXT: add.w r0, r0, r12, lsl #2 ; CHECK-NEXT: add.w r1, r1, r12, lsl #2 ; CHECK-NEXT: add.w r2, r2, r12, lsl #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: .LBB2_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldr s0, [r0] diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll --- a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll +++ b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll @@ -22,10 +22,10 @@ ; CHECK-NEXT: movs r5, #1 ; CHECK-NEXT: sub.w r6, r12, #4 ; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: add.w lr, r5, r6, lsr #2 +; CHECK-NEXT: add.w r6, r5, r6, lsr #2 ; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: dls lr, r6 ; CHECK-NEXT: mov r6, r2 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB0_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r4], #16 @@ -39,11 +39,11 @@ ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, r5, r6, pc} ; CHECK-NEXT: .LBB0_6: @ %for.body.preheader12 -; CHECK-NEXT: sub.w lr, r3, r12 +; CHECK-NEXT: sub.w r3, r3, r12 ; CHECK-NEXT: add.w r0, r0, r12, lsl #2 ; CHECK-NEXT: add.w r1, r1, r12, lsl #2 ; CHECK-NEXT: add.w r2, r2, r12, lsl #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: .LBB0_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldr s0, [r0] @@ -134,11 +134,11 @@ ; CHECK-NEXT: mov.w lr, #1 ; CHECK-NEXT: sub.w r12, r12, #4 ; CHECK-NEXT: vldrw.u32 q0, [r4] -; CHECK-NEXT: add.w lr, lr, r12, lsr #2 +; CHECK-NEXT: add.w r12, lr, r12, lsr #2 +; 
CHECK-NEXT: dls lr, r12 ; CHECK-NEXT: sub.w r12, r3, #1 ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vdup.32 q1, r12 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vdup.32 q2, r3 @@ -218,9 +218,9 @@ ; CHECK-NEXT: beq .LBB2_8 ; CHECK-NEXT: @ %bb.2: @ %for.body.lr.ph ; CHECK-NEXT: ldr r3, [sp, #64] -; CHECK-NEXT: mov.w r11, #0 +; CHECK-NEXT: mov.w r9, #0 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: ldr.w r9, [sp, #56] +; CHECK-NEXT: ldr.w r11, [sp, #56] ; CHECK-NEXT: add.w r0, r1, r3, lsl #1 ; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: adds r0, r1, r3 @@ -232,54 +232,54 @@ ; CHECK-NEXT: lsrs r0, r0, #3 ; CHECK-NEXT: b .LBB2_5 ; CHECK-NEXT: .LBB2_3: @ in Loop: Header=BB2_5 Depth=1 -; CHECK-NEXT: mov r8, r12 ; CHECK-NEXT: mov r10, r12 +; CHECK-NEXT: mov r8, r12 ; CHECK-NEXT: mov r6, r12 ; CHECK-NEXT: .LBB2_4: @ %for.cond.cleanup23 ; CHECK-NEXT: @ in Loop: Header=BB2_5 Depth=1 ; CHECK-NEXT: ldr r3, [sp, #72] -; CHECK-NEXT: add.w r1, r10, r8 +; CHECK-NEXT: add.w r1, r8, r10 ; CHECK-NEXT: add r1, r6 ; CHECK-NEXT: add r1, r12 -; CHECK-NEXT: strb.w r1, [r3, r11] -; CHECK-NEXT: add.w r11, r11, #1 -; CHECK-NEXT: cmp r11, r2 +; CHECK-NEXT: strb.w r1, [r3, r9] +; CHECK-NEXT: add.w r9, r9, #1 +; CHECK-NEXT: cmp r9, r2 ; CHECK-NEXT: beq .LBB2_8 ; CHECK-NEXT: .LBB2_5: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB2_7 Depth 2 ; CHECK-NEXT: ldr r1, [sp, #68] -; CHECK-NEXT: subs.w lr, r0, r0 -; CHECK-NEXT: ldr.w r12, [r1, r11, lsl #2] +; CHECK-NEXT: ldr.w r12, [r1, r9, lsl #2] +; CHECK-NEXT: subs r1, r0, r0 ; CHECK-NEXT: ble .LBB2_3 ; CHECK-NEXT: @ %bb.6: @ %for.body24.preheader ; CHECK-NEXT: @ in Loop: Header=BB2_5 Depth=1 -; CHECK-NEXT: ldr r3, [sp, #64] +; CHECK-NEXT: ldr r7, [sp, #64] ; CHECK-NEXT: mov r6, r12 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload -; 
CHECK-NEXT: mov r10, r12 -; CHECK-NEXT: mla r7, r11, r3, r1 +; CHECK-NEXT: ldr r3, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: ldrd r4, r3, [sp] @ 8-byte Folded Reload ; CHECK-NEXT: mov r8, r12 +; CHECK-NEXT: mla r7, r9, r7, r3 +; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldrd r4, r3, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: mov r10, r12 ; CHECK-NEXT: .LBB2_7: @ %for.body24 ; CHECK-NEXT: @ Parent Loop BB2_5 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vldrb.s16 q0, [r4], #8 -; CHECK-NEXT: vadd.i16 q1, q0, r9 +; CHECK-NEXT: vadd.i16 q1, q0, r11 ; CHECK-NEXT: vldrb.s16 q0, [r7], #8 ; CHECK-NEXT: vmlava.s16 r12, q0, q1 ; CHECK-NEXT: vldrb.s16 q1, [r5], #8 -; CHECK-NEXT: vadd.i16 q1, q1, r9 +; CHECK-NEXT: vadd.i16 q1, q1, r11 ; CHECK-NEXT: vmlava.s16 r6, q0, q1 ; CHECK-NEXT: vldrb.s16 q1, [r3], #8 -; CHECK-NEXT: vadd.i16 q1, q1, r9 -; CHECK-NEXT: vmlava.s16 r10, q0, q1 -; CHECK-NEXT: vldrb.s16 q1, [r1], #8 -; CHECK-NEXT: vadd.i16 q1, q1, r9 +; CHECK-NEXT: vadd.i16 q1, q1, r11 ; CHECK-NEXT: vmlava.s16 r8, q0, q1 +; CHECK-NEXT: vldrb.s16 q1, [r1], #8 +; CHECK-NEXT: vadd.i16 q1, q1, r11 +; CHECK-NEXT: vmlava.s16 r10, q0, q1 ; CHECK-NEXT: le lr, .LBB2_7 ; CHECK-NEXT: b .LBB2_4 ; CHECK-NEXT: .LBB2_8: @ %if.end @@ -401,9 +401,9 @@ ; CHECK-NEXT: beq .LBB3_8 ; CHECK-NEXT: @ %bb.2: @ %for.body.lr.ph ; CHECK-NEXT: ldr r3, [sp, #64] -; CHECK-NEXT: mov.w r11, #0 +; CHECK-NEXT: mov.w r9, #0 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: ldr.w r9, [sp, #56] +; CHECK-NEXT: ldr.w r11, [sp, #56] ; CHECK-NEXT: add.w r0, r1, r3, lsl #1 ; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: adds r0, r1, r3 @@ -417,52 +417,52 @@ ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB3_5 Depth 2 ; CHECK-NEXT: ldr r1, [sp, #68] -; CHECK-NEXT: subs.w lr, r0, r0 -; CHECK-NEXT: ldr.w r12, [r1, r11, lsl #2] +; CHECK-NEXT: ldr.w r12, [r1, 
r9, lsl #2] +; CHECK-NEXT: subs r1, r0, r0 ; CHECK-NEXT: ble .LBB3_6 ; CHECK-NEXT: @ %bb.4: @ %for.body24.preheader ; CHECK-NEXT: @ in Loop: Header=BB3_3 Depth=1 -; CHECK-NEXT: ldr r3, [sp, #64] +; CHECK-NEXT: ldr r7, [sp, #64] ; CHECK-NEXT: mov r6, r12 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: mov r10, r12 -; CHECK-NEXT: mla r7, r11, r3, r1 +; CHECK-NEXT: ldr r3, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: ldrd r4, r3, [sp] @ 8-byte Folded Reload ; CHECK-NEXT: mov r8, r12 +; CHECK-NEXT: mla r7, r9, r7, r3 +; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldrd r4, r3, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: mov r10, r12 ; CHECK-NEXT: .LBB3_5: @ %for.body24 ; CHECK-NEXT: @ Parent Loop BB3_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vldrb.s16 q0, [r4], #8 -; CHECK-NEXT: vadd.i16 q1, q0, r9 +; CHECK-NEXT: vadd.i16 q1, q0, r11 ; CHECK-NEXT: vldrb.s16 q0, [r7], #8 ; CHECK-NEXT: vmlava.s16 r12, q0, q1 ; CHECK-NEXT: vldrb.s16 q1, [r5], #8 -; CHECK-NEXT: vadd.i16 q1, q1, r9 +; CHECK-NEXT: vadd.i16 q1, q1, r11 ; CHECK-NEXT: vmlava.s16 r6, q0, q1 ; CHECK-NEXT: vldrb.s16 q1, [r3], #8 -; CHECK-NEXT: vadd.i16 q1, q1, r9 -; CHECK-NEXT: vmlava.s16 r10, q0, q1 -; CHECK-NEXT: vldrb.s16 q1, [r1], #8 -; CHECK-NEXT: vadd.i16 q1, q1, r9 +; CHECK-NEXT: vadd.i16 q1, q1, r11 ; CHECK-NEXT: vmlava.s16 r8, q0, q1 +; CHECK-NEXT: vldrb.s16 q1, [r1], #8 +; CHECK-NEXT: vadd.i16 q1, q1, r11 +; CHECK-NEXT: vmlava.s16 r10, q0, q1 ; CHECK-NEXT: le lr, .LBB3_5 ; CHECK-NEXT: b .LBB3_7 ; CHECK-NEXT: .LBB3_6: @ in Loop: Header=BB3_3 Depth=1 -; CHECK-NEXT: mov r8, r12 ; CHECK-NEXT: mov r10, r12 +; CHECK-NEXT: mov r8, r12 ; CHECK-NEXT: mov r6, r12 ; CHECK-NEXT: .LBB3_7: @ %for.cond.cleanup23 ; CHECK-NEXT: @ in Loop: Header=BB3_3 Depth=1 ; CHECK-NEXT: ldr r3, [sp, #72] -; CHECK-NEXT: add.w r1, 
r10, r8 +; CHECK-NEXT: add.w r1, r8, r10 ; CHECK-NEXT: add r1, r6 ; CHECK-NEXT: add r1, r12 -; CHECK-NEXT: strb.w r1, [r3, r11] -; CHECK-NEXT: add.w r11, r11, #1 -; CHECK-NEXT: cmp r11, r2 +; CHECK-NEXT: strb.w r1, [r3, r9] +; CHECK-NEXT: add.w r9, r9, #1 +; CHECK-NEXT: cmp r9, r2 ; CHECK-NEXT: bne .LBB3_3 ; CHECK-NEXT: .LBB3_8: @ %if.end ; CHECK-NEXT: ldr r0, [sp, #72] @@ -574,27 +574,35 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r10, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r10, lr} +; CHECK-NEXT: add.w r7, r0, #15 ; CHECK-NEXT: ldr.w r12, [sp, #32] +; CHECK-NEXT: asrs r6, r7, #31 +; CHECK-NEXT: add.w r7, r7, r6, lsr #28 +; CHECK-NEXT: movs r6, #1 +; CHECK-NEXT: asrs r5, r7, #4 +; CHECK-NEXT: cmp r5, #1 +; CHECK-NEXT: it gt +; CHECK-NEXT: asrgt r6, r7, #4 ; CHECK-NEXT: cmp r0, #1 ; CHECK-NEXT: blt .LBB4_3 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: adds r5, r2, r1 -; CHECK-NEXT: add.w r7, r2, r1, lsl #1 +; CHECK-NEXT: adds r7, r2, r1 +; CHECK-NEXT: add.w r5, r2, r1, lsl #1 ; CHECK-NEXT: add.w r1, r1, r1, lsl #1 -; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: dlstp.8 lr, r0 ; CHECK-NEXT: add r1, r2 +; CHECK-NEXT: mov.w r8, #0 ; CHECK-NEXT: movs r6, #0 ; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: mov.w r10, #0 -; CHECK-NEXT: dlstp.8 lr, r0 ; CHECK-NEXT: .LBB4_2: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrb.u8 q0, [r3], #16 ; CHECK-NEXT: vldrb.u8 q1, [r1], #16 ; CHECK-NEXT: vmlava.s8 r10, q1, q0 -; CHECK-NEXT: vldrb.u8 q1, [r7], #16 -; CHECK-NEXT: vmlava.s8 r4, q1, q0 ; CHECK-NEXT: vldrb.u8 q1, [r5], #16 +; CHECK-NEXT: vmlava.s8 r4, q1, q0 +; CHECK-NEXT: vldrb.u8 q1, [r7], #16 ; CHECK-NEXT: vmlava.s8 r6, q1, q0 ; CHECK-NEXT: vldrb.u8 q1, [r2], #16 ; CHECK-NEXT: vmlava.s8 r8, q1, q0 @@ -709,12 +717,12 @@ ; CHECK-NEXT: lsrs r2, r0, #3 ; CHECK-NEXT: b .LBB5_5 ; CHECK-NEXT: .LBB5_3: @ in Loop: Header=BB5_5 Depth=1 -; CHECK-NEXT: mov r8, r12 ; CHECK-NEXT: mov r10, r12 +; CHECK-NEXT: mov r8, r12 ; 
CHECK-NEXT: mov r6, r12 ; CHECK-NEXT: .LBB5_4: @ %for.cond.cleanup23 ; CHECK-NEXT: @ in Loop: Header=BB5_5 Depth=1 -; CHECK-NEXT: add.w r0, r10, r8 +; CHECK-NEXT: add.w r0, r8, r10 ; CHECK-NEXT: ldr r1, [sp, #96] ; CHECK-NEXT: add r0, r6 ; CHECK-NEXT: add r0, r12 @@ -727,21 +735,21 @@ ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB5_7 Depth 2 ; CHECK-NEXT: ldr r0, [sp, #92] -; CHECK-NEXT: subs.w lr, r2, r2 ; CHECK-NEXT: ldr.w r12, [r0, r9, lsl #2] +; CHECK-NEXT: subs r0, r2, r2 ; CHECK-NEXT: ble .LBB5_3 ; CHECK-NEXT: @ %bb.6: @ %for.body24.preheader ; CHECK-NEXT: @ in Loop: Header=BB5_5 Depth=1 ; CHECK-NEXT: ldr.w r11, [sp, #88] ; CHECK-NEXT: mov r6, r12 -; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: dlstp.16 lr, r11 -; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: mov r10, r12 -; CHECK-NEXT: mla r3, r9, r11, r0 ; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: ldrd r7, r0, [sp] @ 8-byte Folded Reload ; CHECK-NEXT: mov r8, r12 +; CHECK-NEXT: mla r3, r9, r11, r1 +; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: ldrd r7, r0, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: mov r10, r12 ; CHECK-NEXT: .LBB5_7: @ %for.body24 ; CHECK-NEXT: @ Parent Loop BB5_5 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 @@ -754,10 +762,10 @@ ; CHECK-NEXT: vmlava.s16 r6, q0, q1 ; CHECK-NEXT: vldrb.s16 q1, [r0], #8 ; CHECK-NEXT: vadd.i16 q1, q1, r4 -; CHECK-NEXT: vmlava.s16 r10, q0, q1 +; CHECK-NEXT: vmlava.s16 r8, q0, q1 ; CHECK-NEXT: vldrb.s16 q1, [r1], #8 ; CHECK-NEXT: vadd.i16 q1, q1, r4 -; CHECK-NEXT: vmlava.s16 r8, q0, q1 +; CHECK-NEXT: vmlava.s16 r10, q0, q1 ; CHECK-NEXT: letp lr, .LBB5_7 ; CHECK-NEXT: b .LBB5_4 ; CHECK-NEXT: .LBB5_8: @ %if.end @@ -899,21 +907,21 @@ ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB6_5 Depth 2 ; CHECK-NEXT: ldr r0, [sp, #92] -; CHECK-NEXT: subs.w lr, r2, r2 ; CHECK-NEXT: ldr.w r12, [r0, 
r9, lsl #2] +; CHECK-NEXT: subs r0, r2, r2 ; CHECK-NEXT: ble .LBB6_6 ; CHECK-NEXT: @ %bb.4: @ %for.body24.preheader ; CHECK-NEXT: @ in Loop: Header=BB6_3 Depth=1 ; CHECK-NEXT: ldr.w r11, [sp, #88] ; CHECK-NEXT: mov r6, r12 -; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: dlstp.16 lr, r11 -; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: mov r10, r12 -; CHECK-NEXT: mla r3, r9, r11, r0 ; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: ldrd r7, r0, [sp] @ 8-byte Folded Reload ; CHECK-NEXT: mov r8, r12 +; CHECK-NEXT: mla r3, r9, r11, r1 +; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: ldrd r7, r0, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: mov r10, r12 ; CHECK-NEXT: .LBB6_5: @ %for.body24 ; CHECK-NEXT: @ Parent Loop BB6_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 @@ -926,19 +934,19 @@ ; CHECK-NEXT: vmlava.s16 r6, q0, q1 ; CHECK-NEXT: vldrb.s16 q1, [r0], #8 ; CHECK-NEXT: vadd.i16 q1, q1, r4 -; CHECK-NEXT: vmlava.s16 r10, q0, q1 +; CHECK-NEXT: vmlava.s16 r8, q0, q1 ; CHECK-NEXT: vldrb.s16 q1, [r1], #8 ; CHECK-NEXT: vadd.i16 q1, q1, r4 -; CHECK-NEXT: vmlava.s16 r8, q0, q1 +; CHECK-NEXT: vmlava.s16 r10, q0, q1 ; CHECK-NEXT: letp lr, .LBB6_5 ; CHECK-NEXT: b .LBB6_7 ; CHECK-NEXT: .LBB6_6: @ in Loop: Header=BB6_3 Depth=1 -; CHECK-NEXT: mov r8, r12 ; CHECK-NEXT: mov r10, r12 +; CHECK-NEXT: mov r8, r12 ; CHECK-NEXT: mov r6, r12 ; CHECK-NEXT: .LBB6_7: @ %for.cond.cleanup23 ; CHECK-NEXT: @ in Loop: Header=BB6_3 Depth=1 -; CHECK-NEXT: add.w r0, r10, r8 +; CHECK-NEXT: add.w r0, r8, r10 ; CHECK-NEXT: ldr r1, [sp, #96] ; CHECK-NEXT: add r0, r6 ; CHECK-NEXT: add r0, r12 @@ -1103,7 +1111,7 @@ ; CHECK-NEXT: lsrs r2, r1, #3 ; CHECK-NEXT: lsls r1, r1, #1 ; CHECK-NEXT: str r2, [sp, #28] @ 4-byte Spill -; CHECK-NEXT: movs r5, #0 +; CHECK-NEXT: movs r6, #0 ; CHECK-NEXT: ldr r2, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill ; CHECK-NEXT: str r3, 
[sp, #32] @ 4-byte Spill @@ -1111,22 +1119,22 @@ ; CHECK-NEXT: .LBB7_6: @ Parent Loop BB7_3 Depth=1 ; CHECK-NEXT: @ => This Loop Header: Depth=2 ; CHECK-NEXT: @ Child Loop BB7_7 Depth 3 -; CHECK-NEXT: add.w r12, r0, #16 -; CHECK-NEXT: ldr r4, [sp, #24] @ 4-byte Reload -; CHECK-NEXT: ldr.w lr, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: ldm.w r12, {r1, r2, r3, r12} -; CHECK-NEXT: muls r4, r5, r4 -; CHECK-NEXT: ldr.w r2, [r2, r10, lsl #2] -; CHECK-NEXT: ldr.w r1, [r1, r10, lsl #2] -; CHECK-NEXT: ldrd r6, r7, [r0, #32] +; CHECK-NEXT: ldrd r3, lr, [r0, #24] +; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: ldrd r12, r2, [r0, #16] ; CHECK-NEXT: ldr.w r3, [r3, r10, lsl #2] -; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: add.w r6, r6, r2, lsl #2 -; CHECK-NEXT: add.w r12, r12, r1, lsl #2 -; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: add.w r2, r1, r4, lsl #2 -; CHECK-NEXT: add.w r3, r7, r3, lsl #2 +; CHECK-NEXT: muls r1, r6, r1 +; CHECK-NEXT: ldr.w r2, [r2, r10, lsl #2] +; CHECK-NEXT: ldrd r7, r5, [r0, #32] +; CHECK-NEXT: add.w r5, r5, r3, lsl #2 +; CHECK-NEXT: ldr.w r4, [r12, r10, lsl #2] +; CHECK-NEXT: add.w r3, r7, r2, lsl #2 +; CHECK-NEXT: ldr r2, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: ldr r7, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: add.w r2, r2, r1, lsl #2 +; CHECK-NEXT: add.w r12, lr, r4, lsl #2 ; CHECK-NEXT: add.w r1, r2, r11, lsl #2 +; CHECK-NEXT: dls lr, r7 ; CHECK-NEXT: add.w r8, r1, r11, lsl #2 ; CHECK-NEXT: add.w r9, r8, r11, lsl #2 ; CHECK-NEXT: .LBB7_7: @ Parent Loop BB7_3 Depth=1 @@ -1145,7 +1153,7 @@ ; CHECK-NEXT: vsub.f32 q4, q3, q0 ; CHECK-NEXT: vadd.f32 q0, q3, q0 ; CHECK-NEXT: vstrb.8 q0, [r2], #16 -; CHECK-NEXT: vldrw.u32 q0, [r6], #16 +; CHECK-NEXT: vldrw.u32 q0, [r3], #16 ; CHECK-NEXT: vcmul.f32 q3, q0, q4, #0 ; CHECK-NEXT: vcmla.f32 q3, q0, q4, #90 ; CHECK-NEXT: vstrb.8 q3, [r1], #16 @@ -1153,15 +1161,15 @@ ; CHECK-NEXT: vcmul.f32 q3, q0, q2, #0 ; CHECK-NEXT: vcmla.f32 q3, q0, q2, #90 ; CHECK-NEXT: vstrb.8 q3, [r8], #16 -; 
CHECK-NEXT: vldrw.u32 q0, [r3], #16 +; CHECK-NEXT: vldrw.u32 q0, [r5], #16 ; CHECK-NEXT: vcmul.f32 q2, q0, q1, #0 ; CHECK-NEXT: vcmla.f32 q2, q0, q1, #90 ; CHECK-NEXT: vstrb.8 q2, [r9], #16 ; CHECK-NEXT: le lr, .LBB7_7 ; CHECK-NEXT: @ %bb.8: @ in Loop: Header=BB7_6 Depth=2 ; CHECK-NEXT: ldr r3, [sp, #32] @ 4-byte Reload -; CHECK-NEXT: adds r5, #1 -; CHECK-NEXT: cmp r5, r3 +; CHECK-NEXT: adds r6, #1 +; CHECK-NEXT: cmp r6, r3 ; CHECK-NEXT: bne .LBB7_6 ; CHECK-NEXT: b .LBB7_2 ; CHECK-NEXT: .LBB7_9: diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll b/llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll --- a/llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll @@ -14,9 +14,9 @@ ; CHECK-NEXT: add.w r1, r3, r1, lsl #2 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: add.w lr, r3, r1, lsr #2 +; CHECK-NEXT: add.w r1, r3, r1, lsr #2 +; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: rsbs r1, r2, #0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r0] @@ -71,9 +71,9 @@ ; CHECK-NEXT: add.w r1, r3, r1, lsl #3 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: add.w lr, r3, r1, lsr #3 +; CHECK-NEXT: add.w r1, r3, r1, lsr #3 +; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: rsbs r1, r2, #0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q1, [r0] @@ -128,9 +128,9 @@ ; CHECK-NEXT: add.w r1, r3, r1, lsl #4 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: add.w lr, r3, r1, lsr #4 +; CHECK-NEXT: add.w r1, r3, r1, lsr #4 +; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: rsbs r1, r2, #0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB2_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrb.u8 q1, [r0] @@ -184,10 +184,10 @@ ; CHECK-NEXT: mvn r2, #3 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; 
CHECK-NEXT: movs r2, #1 -; CHECK-NEXT: add.w lr, r2, r1, lsr #2 +; CHECK-NEXT: add.w r2, r2, r1, lsr #2 ; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: dls lr, r2 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: eor r2, r1, #-2147483648 ; CHECK-NEXT: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -241,13 +241,13 @@ ; CHECK-NEXT: .LBB4_1: @ %vector.ph ; CHECK-NEXT: mvn r3, #7 ; CHECK-NEXT: add.w r1, r3, r1, lsl #3 +; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vneg.f16 s0, s0 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r1, lsr #3 +; CHECK-NEXT: add.w r3, r3, r1, lsr #3 ; CHECK-NEXT: vmov.f16 r1, s0 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB4_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q1, [r0] @@ -307,9 +307,9 @@ ; CHECK-NEXT: add.w r1, r3, r1, lsl #2 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: add.w lr, r3, r1, lsr #2 +; CHECK-NEXT: add.w r1, r3, r1, lsr #2 +; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: rsbs r1, r2, #0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB5_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r0] @@ -364,9 +364,9 @@ ; CHECK-NEXT: add.w r1, r3, r1, lsl #3 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: add.w lr, r3, r1, lsr #3 +; CHECK-NEXT: add.w r1, r3, r1, lsr #3 +; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: rsbs r1, r2, #0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB6_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q1, [r0] @@ -421,9 +421,9 @@ ; CHECK-NEXT: add.w r1, r3, r1, lsl #4 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: add.w lr, r3, r1, lsr #4 +; CHECK-NEXT: add.w r1, r3, r1, lsr #4 +; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: rsbs r1, r2, #0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB7_2: @ 
%vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrb.u8 q1, [r0] @@ -477,10 +477,10 @@ ; CHECK-NEXT: mvn r2, #3 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: movs r2, #1 -; CHECK-NEXT: add.w lr, r2, r1, lsr #2 +; CHECK-NEXT: add.w r2, r2, r1, lsr #2 ; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: dls lr, r2 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: eor r2, r1, #-2147483648 ; CHECK-NEXT: .LBB8_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -534,13 +534,13 @@ ; CHECK-NEXT: .LBB9_1: @ %vector.ph ; CHECK-NEXT: mvn r3, #7 ; CHECK-NEXT: add.w r1, r3, r1, lsl #3 +; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vneg.f16 s0, s0 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r1, lsr #3 +; CHECK-NEXT: add.w r3, r3, r1, lsr #3 ; CHECK-NEXT: vmov.f16 r1, s0 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB9_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q1, [r0] diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll b/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll --- a/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll @@ -9,21 +9,32 @@ ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: movs r6, #0 +; CHECK-NEXT: mov r12, r1 ; CHECK-NEXT: vidup.u32 q2, r6, #1 +; CHECK-NEXT: cmp r1, #4 +; CHECK-NEXT: it ge +; CHECK-NEXT: movge.w r12, #4 +; CHECK-NEXT: sub.w r6, r1, r12 +; CHECK-NEXT: adds r6, #3 +; CHECK-NEXT: mov.w lr, #1 ; CHECK-NEXT: adr r4, .LCPI0_0 ; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: add.w r6, lr, r6, lsr #2 ; CHECK-NEXT: vldrw.u32 q1, [r4] +; CHECK-NEXT: dls lr, r6 ; CHECK-NEXT: vmov.i32 q3, #0x4 ; CHECK-NEXT: mov r12, r1 -; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB0_1: @ %do.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q4, [r0], #16 -; 
CHECK-NEXT: vptt.f32 ge, q1, q4 +; CHECK-NEXT: vctp.32 r12 +; CHECK-NEXT: sub.w r12, r12, #4 +; CHECK-NEXT: vpstttt +; CHECK-NEXT: vldrwt.u32 q4, [r0], #16 +; CHECK-NEXT: vcmpt.f32 ge, q1, q4 ; CHECK-NEXT: vmovt q1, q4 ; CHECK-NEXT: vmovt q0, q2 ; CHECK-NEXT: vadd.i32 q2, q2, q3 -; CHECK-NEXT: letp lr, .LBB0_1 +; CHECK-NEXT: le lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %do.end ; CHECK-NEXT: vldr s8, .LCPI0_1 ; CHECK-NEXT: vdup.32 q3, r1 diff --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll --- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll @@ -27,17 +27,17 @@ ; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: bic r3, r3, #1 ; CHECK-NEXT: subs r7, r3, #2 -; CHECK-NEXT: adr r4, .LCPI0_0 ; CHECK-NEXT: movs r6, #1 +; CHECK-NEXT: adr r4, .LCPI0_0 +; CHECK-NEXT: str r3, [sp] @ 4-byte Spill +; CHECK-NEXT: add.w r7, r6, r7, lsr #1 ; CHECK-NEXT: add.w r11, r2, r3, lsl #2 -; CHECK-NEXT: add.w lr, r6, r7, lsr #1 ; CHECK-NEXT: add.w r9, r1, r3, lsl #2 ; CHECK-NEXT: add.w r12, r0, r3, lsl #2 +; CHECK-NEXT: dls lr, r7 ; CHECK-NEXT: vldrw.u32 q0, [r4] ; CHECK-NEXT: vmvn.i32 q1, #0x80000000 ; CHECK-NEXT: mov.w r10, #-1 -; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: str r3, [sp] @ 4-byte Spill ; CHECK-NEXT: .LBB0_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrd r4, r5, [r0] @@ -108,11 +108,11 @@ ; CHECK-NEXT: cmp r7, r3 ; CHECK-NEXT: beq .LBB0_8 ; CHECK-NEXT: .LBB0_6: @ %for.body.preheader -; CHECK-NEXT: sub.w lr, r3, r7 -; CHECK-NEXT: mov.w r0, #-1 +; CHECK-NEXT: subs r0, r3, r7 ; CHECK-NEXT: mov.w r1, #-2147483648 +; CHECK-NEXT: dls lr, r0 +; CHECK-NEXT: mov.w r0, #-1 ; CHECK-NEXT: mvn r2, #-2147483648 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB0_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r3, [r12], #4 @@ -248,16 +248,16 @@ ; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: subs r2, r3, #4 ; CHECK-NEXT: movs r7, 
#1 +; CHECK-NEXT: str r3, [sp] @ 4-byte Spill ; CHECK-NEXT: add.w r11, r8, r3, lsl #2 +; CHECK-NEXT: add.w r7, r7, r2, lsr #2 ; CHECK-NEXT: add.w r10, r1, r3, lsl #2 -; CHECK-NEXT: add.w lr, r7, r2, lsr #2 +; CHECK-NEXT: dls lr, r7 ; CHECK-NEXT: adr r7, .LCPI1_0 ; CHECK-NEXT: vldrw.u32 q0, [r7] ; CHECK-NEXT: adr r7, .LCPI1_1 ; CHECK-NEXT: add.w r12, r0, r3, lsl #2 ; CHECK-NEXT: vldrw.u32 q1, [r7] -; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: str r3, [sp] @ 4-byte Spill ; CHECK-NEXT: mov.w r3, #-1 ; CHECK-NEXT: mvn r9, #-2147483648 ; CHECK-NEXT: .LBB1_4: @ %vector.body @@ -395,11 +395,11 @@ ; CHECK-NEXT: cmp r2, r3 ; CHECK-NEXT: beq .LBB1_8 ; CHECK-NEXT: .LBB1_6: @ %for.body.preheader21 -; CHECK-NEXT: sub.w lr, r3, r2 -; CHECK-NEXT: mov.w r0, #-1 +; CHECK-NEXT: subs r0, r3, r2 ; CHECK-NEXT: mov.w r1, #-2147483648 +; CHECK-NEXT: dls lr, r0 +; CHECK-NEXT: mov.w r0, #-1 ; CHECK-NEXT: mvn r3, #-2147483648 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB1_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r2, [r12], #4 @@ -536,15 +536,15 @@ ; CHECK-NEXT: vldrw.u32 q2, [r4] ; CHECK-NEXT: adr r4, .LCPI2_2 ; CHECK-NEXT: mov.w r9, #0 -; CHECK-NEXT: add.w lr, r6, r7, lsr #2 +; CHECK-NEXT: add.w r7, r6, r7, lsr #2 ; CHECK-NEXT: adr r6, .LCPI2_0 +; CHECK-NEXT: dls lr, r7 ; CHECK-NEXT: subs r7, r3, #1 ; CHECK-NEXT: vldrw.u32 q0, [r6] ; CHECK-NEXT: vldrw.u32 q3, [r4] ; CHECK-NEXT: vdup.32 q1, r7 ; CHECK-NEXT: mov.w r12, #-1 ; CHECK-NEXT: mvn r8, #-2147483648 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill ; CHECK-NEXT: .LBB2_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -769,12 +769,12 @@ ; CHECK-NEXT: bic r5, r3, #1 ; CHECK-NEXT: movs r6, #1 ; CHECK-NEXT: subs r7, r5, #2 +; CHECK-NEXT: str r5, [sp] @ 4-byte Spill ; CHECK-NEXT: add.w r8, r2, r5, lsl #2 ; CHECK-NEXT: add.w r11, r1, r5, lsl #2 -; CHECK-NEXT: add.w lr, r6, r7, lsr #1 +; CHECK-NEXT: add.w r4, r6, r7, lsr #1 ; CHECK-NEXT: add.w r12, r0, r5, lsl 
#2 -; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: str r5, [sp] @ 4-byte Spill +; CHECK-NEXT: dls lr, r4 ; CHECK-NEXT: .LBB3_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrd r4, r9, [r0] @@ -816,8 +816,8 @@ ; CHECK-NEXT: cmp r7, r3 ; CHECK-NEXT: beq .LBB3_8 ; CHECK-NEXT: .LBB3_6: @ %for.body.preheader -; CHECK-NEXT: sub.w lr, r3, r7 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: subs r0, r3, r7 +; CHECK-NEXT: dls lr, r0 ; CHECK-NEXT: .LBB3_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r0, [r12], #4 @@ -929,9 +929,9 @@ ; CHECK-NEXT: sub.w r7, r8, #4 ; CHECK-NEXT: add.w r10, r2, r8, lsl #2 ; CHECK-NEXT: add.w r9, r1, r8, lsl #2 -; CHECK-NEXT: add.w lr, r6, r7, lsr #2 +; CHECK-NEXT: add.w r4, r6, r7, lsr #2 ; CHECK-NEXT: add.w r12, r0, r8, lsl #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dls lr, r4 ; CHECK-NEXT: .LBB4_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 @@ -1007,8 +1007,8 @@ ; CHECK-NEXT: cmp r8, r3 ; CHECK-NEXT: beq .LBB4_8 ; CHECK-NEXT: .LBB4_6: @ %for.body.preheader21 -; CHECK-NEXT: sub.w lr, r3, r8 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: sub.w r0, r3, r8 +; CHECK-NEXT: dls lr, r0 ; CHECK-NEXT: .LBB4_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r0, [r12], #4 @@ -1138,8 +1138,8 @@ ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, r5, r6, pc} ; CHECK-NEXT: .LBB5_6: @ %for.body.preheader21 -; CHECK-NEXT: sub.w lr, r3, r5 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: subs r0, r3, r5 +; CHECK-NEXT: dls lr, r0 ; CHECK-NEXT: .LBB5_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrsh r0, [r12], #2 @@ -1271,8 +1271,8 @@ ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, r5, r6, pc} ; CHECK-NEXT: .LBB6_6: @ %for.body.preheader21 -; CHECK-NEXT: sub.w lr, r3, r5 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: subs r0, r3, r5 +; CHECK-NEXT: dls lr, r0 ; CHECK-NEXT: .LBB6_7: @ %for.body ; CHECK-NEXT: @ 
=>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrsh r0, [r12], #2 @@ -1401,8 +1401,8 @@ ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, r5, r6, pc} ; CHECK-NEXT: .LBB7_6: @ %for.body.preheader21 -; CHECK-NEXT: sub.w lr, r3, r5 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: subs r0, r3, r5 +; CHECK-NEXT: dls lr, r0 ; CHECK-NEXT: .LBB7_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrsh r0, [r12], #2 @@ -1519,11 +1519,11 @@ ; CHECK-NEXT: mov.w lr, #1 ; CHECK-NEXT: sub.w r12, r12, #4 ; CHECK-NEXT: vldrw.u32 q0, [r4] -; CHECK-NEXT: add.w lr, lr, r12, lsr #2 +; CHECK-NEXT: add.w r12, lr, r12, lsr #2 +; CHECK-NEXT: dls lr, r12 ; CHECK-NEXT: sub.w r12, r3, #1 ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vdup.32 q1, r12 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB8_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vdup.32 q2, r3 @@ -1611,13 +1611,13 @@ ; CHECK-NEXT: mov.w lr, #1 ; CHECK-NEXT: adr r4, .LCPI9_1 ; CHECK-NEXT: vmov.i8 q2, #0x0 -; CHECK-NEXT: add.w lr, lr, r12, lsr #3 -; CHECK-NEXT: sub.w r12, r3, #1 +; CHECK-NEXT: add.w r12, lr, r12, lsr #3 ; CHECK-NEXT: vldrw.u32 q4, [r4] +; CHECK-NEXT: dls lr, r12 +; CHECK-NEXT: sub.w r12, r3, #1 ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vdup.32 q1, r12 ; CHECK-NEXT: vmov.i8 q3, #0xff -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill ; CHECK-NEXT: .LBB9_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -1785,13 +1785,13 @@ ; CHECK-NEXT: mov.w lr, #1 ; CHECK-NEXT: adr r4, .LCPI10_1 ; CHECK-NEXT: vmov.i8 q2, #0x0 -; CHECK-NEXT: add.w lr, lr, r12, lsr #3 -; CHECK-NEXT: sub.w r12, r3, #1 +; CHECK-NEXT: add.w r12, lr, r12, lsr #3 ; CHECK-NEXT: vldrw.u32 q4, [r4] +; CHECK-NEXT: dls lr, r12 +; CHECK-NEXT: sub.w r12, r3, #1 ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vdup.32 q1, r12 ; CHECK-NEXT: vmov.i8 q3, #0xff -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB10_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: 
vdup.32 q6, r3 @@ -1938,9 +1938,9 @@ ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, r5, r6, pc} ; CHECK-NEXT: .LBB11_6: @ %for.body.preheader21 -; CHECK-NEXT: sub.w lr, r3, r5 +; CHECK-NEXT: subs r0, r3, r5 +; CHECK-NEXT: dls lr, r0 ; CHECK-NEXT: movw r0, #65535 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB11_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrh r1, [r12], #2 @@ -2072,9 +2072,9 @@ ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, r5, r6, pc} ; CHECK-NEXT: .LBB12_6: @ %for.body.preheader21 -; CHECK-NEXT: sub.w lr, r3, r5 +; CHECK-NEXT: subs r0, r3, r5 +; CHECK-NEXT: dls lr, r0 ; CHECK-NEXT: movw r0, #65535 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB12_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrh r1, [r12], #2 @@ -2208,8 +2208,8 @@ ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, r5, r6, pc} ; CHECK-NEXT: .LBB13_6: @ %for.body.preheader21 -; CHECK-NEXT: sub.w lr, r3, r5 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: subs r0, r3, r5 +; CHECK-NEXT: dls lr, r0 ; CHECK-NEXT: .LBB13_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrsb r0, [r12], #1 @@ -2335,8 +2335,8 @@ ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, r5, r6, pc} ; CHECK-NEXT: .LBB14_6: @ %for.body.preheader23 -; CHECK-NEXT: sub.w lr, r3, r5 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: subs r0, r3, r5 +; CHECK-NEXT: dls lr, r0 ; CHECK-NEXT: .LBB14_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrsb r0, [r12], #1 @@ -2468,8 +2468,8 @@ ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, r5, r6, pc} ; CHECK-NEXT: .LBB15_6: @ %for.body.preheader23 -; CHECK-NEXT: sub.w lr, r3, r5 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: subs r0, r3, r5 +; CHECK-NEXT: dls lr, r0 ; CHECK-NEXT: .LBB15_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrsb r0, [r12], #1 @@ -2598,8 +2598,8 @@ ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, r5, r6, pc} ; CHECK-NEXT: .LBB16_6: @ 
%for.body.preheader23 -; CHECK-NEXT: sub.w lr, r3, r5 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: subs r0, r3, r5 +; CHECK-NEXT: dls lr, r0 ; CHECK-NEXT: .LBB16_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrsb r0, [r12], #1 @@ -2719,13 +2719,13 @@ ; CHECK-NEXT: mov.w lr, #1 ; CHECK-NEXT: adr r4, .LCPI17_1 ; CHECK-NEXT: vmov.i8 q2, #0x0 -; CHECK-NEXT: add.w lr, lr, r12, lsr #3 -; CHECK-NEXT: sub.w r12, r3, #1 +; CHECK-NEXT: add.w r12, lr, r12, lsr #3 ; CHECK-NEXT: vldrw.u32 q4, [r4] +; CHECK-NEXT: dls lr, r12 +; CHECK-NEXT: sub.w r12, r3, #1 ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vdup.32 q1, r12 ; CHECK-NEXT: vmov.i8 q3, #0xff -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB17_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vdup.32 q6, r3 @@ -2840,18 +2840,18 @@ ; CHECK-NEXT: mov.w lr, #1 ; CHECK-NEXT: adr r4, .LCPI18_1 ; CHECK-NEXT: vmov.i8 q2, #0x0 -; CHECK-NEXT: add.w lr, lr, r12, lsr #4 -; CHECK-NEXT: sub.w r12, r3, #1 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: add.w r12, lr, r12, lsr #4 +; CHECK-NEXT: vmov.i8 q3, #0xff +; CHECK-NEXT: dls lr, r12 ; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r4] ; CHECK-NEXT: adr r4, .LCPI18_2 +; CHECK-NEXT: sub.w r12, r3, #1 ; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: vdup.32 q1, r12 ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r4] ; CHECK-NEXT: adr r4, .LCPI18_3 -; CHECK-NEXT: vmov.i8 q3, #0xff +; CHECK-NEXT: vdup.32 q1, r12 ; CHECK-NEXT: vldrw.u32 q6, [r4] ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill ; CHECK-NEXT: .LBB18_2: @ %vector.body @@ -3140,18 +3140,18 @@ ; CHECK-NEXT: mov.w lr, #1 ; CHECK-NEXT: adr r4, .LCPI19_1 ; CHECK-NEXT: vmov.i8 q2, #0x0 -; CHECK-NEXT: add.w lr, lr, r12, lsr #4 -; CHECK-NEXT: sub.w r12, r3, #1 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: add.w r12, lr, r12, lsr #4 +; CHECK-NEXT: vmov.i8 q3, #0xff +; CHECK-NEXT: dls lr, r12 ; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 
16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r4] ; CHECK-NEXT: adr r4, .LCPI19_2 +; CHECK-NEXT: sub.w r12, r3, #1 ; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: vdup.32 q1, r12 ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r4] ; CHECK-NEXT: adr r4, .LCPI19_3 -; CHECK-NEXT: vmov.i8 q3, #0xff +; CHECK-NEXT: vdup.32 q1, r12 ; CHECK-NEXT: vldrw.u32 q6, [r4] ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill ; CHECK-NEXT: .LBB19_2: @ %vector.body @@ -3372,8 +3372,8 @@ ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, r5, r6, pc} ; CHECK-NEXT: .LBB20_6: @ %for.body.preheader23 -; CHECK-NEXT: sub.w lr, r3, r5 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: subs r0, r3, r5 +; CHECK-NEXT: dls lr, r0 ; CHECK-NEXT: .LBB20_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrb r0, [r12], #1 @@ -3507,8 +3507,8 @@ ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, r5, r6, pc} ; CHECK-NEXT: .LBB21_6: @ %for.body.preheader23 -; CHECK-NEXT: sub.w lr, r3, r5 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: subs r0, r3, r5 +; CHECK-NEXT: dls lr, r0 ; CHECK-NEXT: .LBB21_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrb r0, [r12], #1 diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll --- a/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll @@ -163,10 +163,10 @@ ; CHECK-NEXT: .LBB3_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB3_3 Depth 2 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: vmov q6, q4 ; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload ; CHECK-NEXT: vmov q5, q3 -; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: .LBB3_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB3_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 diff --git a/llvm/test/CodeGen/Thumb2/mve-shifts-scalar.ll b/llvm/test/CodeGen/Thumb2/mve-shifts-scalar.ll --- 
a/llvm/test/CodeGen/Thumb2/mve-shifts-scalar.ll +++ b/llvm/test/CodeGen/Thumb2/mve-shifts-scalar.ll @@ -9,8 +9,8 @@ ; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: add.w r3, r3, r12, lsr #2 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 @@ -53,8 +53,8 @@ ; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: add.w r3, r3, r12, lsr #2 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #8 @@ -97,8 +97,8 @@ ; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: add.w r3, r3, r12, lsr #2 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #4 @@ -142,8 +142,8 @@ ; CHECK-NEXT: rsbs r2, r2, #0 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: add.w r3, r3, r12, lsr #2 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 @@ -187,8 +187,8 @@ ; CHECK-NEXT: rsbs r2, r2, #0 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: add.w r3, r3, r12, lsr #2 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: .LBB4_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #8 @@ -232,8 +232,8 @@ ; CHECK-NEXT: rsbs r2, r2, #0 ; CHECK-NEXT: sub.w 
r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: add.w r3, r3, r12, lsr #2 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: .LBB5_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #4 @@ -277,8 +277,8 @@ ; CHECK-NEXT: rsbs r2, r2, #0 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: add.w r3, r3, r12, lsr #2 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: .LBB6_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 @@ -322,8 +322,8 @@ ; CHECK-NEXT: rsbs r2, r2, #0 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: add.w r3, r3, r12, lsr #2 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: .LBB7_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #8 @@ -367,8 +367,8 @@ ; CHECK-NEXT: rsbs r2, r2, #0 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: add.w r3, r3, r12, lsr #2 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #4 diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll @@ -23,10 +23,10 @@ ; CHECK-NEXT: bic r3, r1, #3 ; CHECK-NEXT: movs r2, #1 ; CHECK-NEXT: subs r0, r3, #4 -; CHECK-NEXT: add.w lr, r2, r0, lsr #2 -; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: add.w r0, r2, r0, lsr #2 ; CHECK-NEXT: mov r2, r12 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dls lr, r0 +; CHECK-NEXT: movs r0, #0 ; CHECK-NEXT: .LBB0_5: @ %vector.body ; CHECK-NEXT: @ =>This 
Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r2], #16 @@ -37,13 +37,13 @@ ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} ; CHECK-NEXT: .LBB0_7: @ %for.body.preheader1 -; CHECK-NEXT: sub.w lr, r1, r3 -; CHECK-NEXT: add.w r1, r12, r3, lsl #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: subs r1, r1, r3 +; CHECK-NEXT: add.w r2, r12, r3, lsl #2 +; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: .LBB0_8: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr r2, [r1], #4 -; CHECK-NEXT: add r0, r2 +; CHECK-NEXT: ldr r1, [r2], #4 +; CHECK-NEXT: add r0, r1 ; CHECK-NEXT: le lr, .LBB0_8 ; CHECK-NEXT: .LBB0_9: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -113,9 +113,9 @@ ; CHECK-NEXT: bic r12, r1, #3 ; CHECK-NEXT: vmov.i32 q0, #0x1 ; CHECK-NEXT: sub.w r3, r12, #4 -; CHECK-NEXT: add.w lr, r2, r3, lsr #2 +; CHECK-NEXT: add.w r2, r2, r3, lsr #2 +; CHECK-NEXT: dls lr, r2 ; CHECK-NEXT: mov r2, r0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB1_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r2], #16 @@ -132,9 +132,9 @@ ; CHECK-NEXT: mul r2, r2, lr ; CHECK-NEXT: beq .LBB1_8 ; CHECK-NEXT: .LBB1_6: @ %for.body.preheader1 -; CHECK-NEXT: sub.w lr, r1, r12 +; CHECK-NEXT: sub.w r1, r1, r12 ; CHECK-NEXT: add.w r0, r0, r12, lsl #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: .LBB1_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r1, [r0], #4 @@ -213,9 +213,9 @@ ; CHECK-NEXT: movs r2, #1 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: vmov.i8 q0, #0xff -; CHECK-NEXT: add.w lr, r2, r12, lsr #2 +; CHECK-NEXT: add.w r2, r2, r12, lsr #2 +; CHECK-NEXT: dls lr, r2 ; CHECK-NEXT: mov r2, r0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB2_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r2], #16 @@ -232,9 +232,9 @@ ; CHECK-NEXT: and.w r2, r2, r12 ; CHECK-NEXT: beq .LBB2_9 ; CHECK-NEXT: .LBB2_7: @ %for.body.preheader1 -; 
CHECK-NEXT: sub.w lr, r1, r3 +; CHECK-NEXT: subs r1, r1, r3 ; CHECK-NEXT: add.w r0, r0, r3, lsl #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: .LBB2_8: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r1, [r0], #4 @@ -313,9 +313,9 @@ ; CHECK-NEXT: movs r2, #1 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: add.w lr, r2, r12, lsr #2 +; CHECK-NEXT: add.w r2, r2, r12, lsr #2 +; CHECK-NEXT: dls lr, r2 ; CHECK-NEXT: mov r2, r0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB3_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r2], #16 @@ -332,9 +332,9 @@ ; CHECK-NEXT: orr.w r2, r2, r12 ; CHECK-NEXT: beq .LBB3_9 ; CHECK-NEXT: .LBB3_7: @ %for.body.preheader1 -; CHECK-NEXT: sub.w lr, r1, r3 +; CHECK-NEXT: subs r1, r1, r3 ; CHECK-NEXT: add.w r0, r0, r3, lsl #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: .LBB3_8: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r1, [r0], #4 @@ -413,9 +413,9 @@ ; CHECK-NEXT: movs r2, #1 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: add.w lr, r2, r12, lsr #2 +; CHECK-NEXT: add.w r2, r2, r12, lsr #2 +; CHECK-NEXT: dls lr, r2 ; CHECK-NEXT: mov r2, r0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB4_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r2], #16 @@ -432,9 +432,9 @@ ; CHECK-NEXT: eor.w r2, r2, r12 ; CHECK-NEXT: beq .LBB4_9 ; CHECK-NEXT: .LBB4_7: @ %for.body.preheader1 -; CHECK-NEXT: sub.w lr, r1, r3 +; CHECK-NEXT: subs r1, r1, r3 ; CHECK-NEXT: add.w r0, r0, r3, lsl #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: .LBB4_8: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r1, [r0], #4 @@ -513,9 +513,9 @@ ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: sub.w r12, r2, #4 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 +; 
CHECK-NEXT: add.w r3, r3, r12, lsr #2 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: mov r3, r0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB5_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r3], #16 @@ -528,9 +528,9 @@ ; CHECK-NEXT: vadd.f32 s0, s0, s4 ; CHECK-NEXT: beq .LBB5_9 ; CHECK-NEXT: .LBB5_7: @ %for.body.preheader1 -; CHECK-NEXT: sub.w lr, r1, r2 +; CHECK-NEXT: subs r1, r1, r2 ; CHECK-NEXT: add.w r0, r0, r2, lsl #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: .LBB5_8: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldr s2, [r0] @@ -614,9 +614,9 @@ ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: sub.w r12, r2, #4 ; CHECK-NEXT: vmov.f32 q0, #1.000000e+00 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 +; CHECK-NEXT: add.w r3, r3, r12, lsr #2 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: mov r3, r0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB6_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r3], #16 @@ -629,9 +629,9 @@ ; CHECK-NEXT: vmul.f32 s0, s0, s4 ; CHECK-NEXT: beq .LBB6_9 ; CHECK-NEXT: .LBB6_7: @ %for.body.preheader1 -; CHECK-NEXT: sub.w lr, r1, r2 +; CHECK-NEXT: subs r1, r1, r2 ; CHECK-NEXT: add.w r0, r0, r2, lsl #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: .LBB6_8: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldr s2, [r0] @@ -711,9 +711,9 @@ ; CHECK-NEXT: movs r2, #1 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: vmvn.i32 q0, #0x80000000 -; CHECK-NEXT: add.w lr, r2, r12, lsr #2 +; CHECK-NEXT: add.w r2, r2, r12, lsr #2 +; CHECK-NEXT: dls lr, r2 ; CHECK-NEXT: mov r2, r0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB7_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r2], #16 @@ -725,9 +725,9 @@ ; CHECK-NEXT: vminv.s32 r2, q0 ; CHECK-NEXT: beq .LBB7_9 ; CHECK-NEXT: .LBB7_7: @ %for.body.preheader1 -; CHECK-NEXT: sub.w lr, r1, r3 +; 
CHECK-NEXT: subs r1, r1, r3 ; CHECK-NEXT: add.w r0, r0, r3, lsl #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: .LBB7_8: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r1, [r0], #4 @@ -809,10 +809,10 @@ ; CHECK-NEXT: bic r3, r1, #3 ; CHECK-NEXT: movs r2, #1 ; CHECK-NEXT: subs r0, r3, #4 -; CHECK-NEXT: add.w lr, r2, r0, lsr #2 -; CHECK-NEXT: mvn r0, #-2147483648 +; CHECK-NEXT: add.w r0, r2, r0, lsr #2 ; CHECK-NEXT: mov r2, r12 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dls lr, r0 +; CHECK-NEXT: mvn r0, #-2147483648 ; CHECK-NEXT: .LBB8_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r2], #16 @@ -823,14 +823,14 @@ ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} ; CHECK-NEXT: .LBB8_7: @ %for.body.preheader1 -; CHECK-NEXT: sub.w lr, r1, r3 -; CHECK-NEXT: add.w r1, r12, r3, lsl #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: subs r1, r1, r3 +; CHECK-NEXT: add.w r2, r12, r3, lsl #2 +; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: .LBB8_8: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr r2, [r1], #4 -; CHECK-NEXT: cmp r0, r2 -; CHECK-NEXT: csel r0, r0, r2, lt +; CHECK-NEXT: ldr r1, [r2], #4 +; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: csel r0, r0, r1, lt ; CHECK-NEXT: le lr, .LBB8_8 ; CHECK-NEXT: .LBB8_9: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -907,9 +907,9 @@ ; CHECK-NEXT: movs r2, #1 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: vmov.i32 q0, #0x80000000 -; CHECK-NEXT: add.w lr, r2, r12, lsr #2 +; CHECK-NEXT: add.w r2, r2, r12, lsr #2 +; CHECK-NEXT: dls lr, r2 ; CHECK-NEXT: mov r2, r0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB9_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r2], #16 @@ -921,9 +921,9 @@ ; CHECK-NEXT: vmaxv.s32 r2, q0 ; CHECK-NEXT: beq .LBB9_9 ; CHECK-NEXT: .LBB9_7: @ %for.body.preheader1 -; CHECK-NEXT: sub.w lr, r1, r3 +; CHECK-NEXT: subs r1, r1, r3 ; CHECK-NEXT: add.w r0, r0, 
r3, lsl #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: .LBB9_8: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r1, [r0], #4 @@ -1005,10 +1005,10 @@ ; CHECK-NEXT: bic r3, r1, #3 ; CHECK-NEXT: movs r2, #1 ; CHECK-NEXT: subs r0, r3, #4 -; CHECK-NEXT: add.w lr, r2, r0, lsr #2 -; CHECK-NEXT: mov.w r0, #-2147483648 +; CHECK-NEXT: add.w r0, r2, r0, lsr #2 ; CHECK-NEXT: mov r2, r12 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dls lr, r0 +; CHECK-NEXT: mov.w r0, #-2147483648 ; CHECK-NEXT: .LBB10_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r2], #16 @@ -1019,14 +1019,14 @@ ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} ; CHECK-NEXT: .LBB10_7: @ %for.body.preheader1 -; CHECK-NEXT: sub.w lr, r1, r3 -; CHECK-NEXT: add.w r1, r12, r3, lsl #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: subs r1, r1, r3 +; CHECK-NEXT: add.w r2, r12, r3, lsl #2 +; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: .LBB10_8: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr r2, [r1], #4 -; CHECK-NEXT: cmp r0, r2 -; CHECK-NEXT: csel r0, r0, r2, gt +; CHECK-NEXT: ldr r1, [r2], #4 +; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: csel r0, r0, r1, gt ; CHECK-NEXT: le lr, .LBB10_8 ; CHECK-NEXT: .LBB10_9: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -1103,9 +1103,9 @@ ; CHECK-NEXT: movs r2, #1 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: vmov.i8 q0, #0xff -; CHECK-NEXT: add.w lr, r2, r12, lsr #2 +; CHECK-NEXT: add.w r2, r2, r12, lsr #2 +; CHECK-NEXT: dls lr, r2 ; CHECK-NEXT: mov r2, r0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB11_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r2], #16 @@ -1117,9 +1117,9 @@ ; CHECK-NEXT: vminv.u32 r2, q0 ; CHECK-NEXT: beq .LBB11_9 ; CHECK-NEXT: .LBB11_7: @ %for.body.preheader1 -; CHECK-NEXT: sub.w lr, r1, r3 +; CHECK-NEXT: subs r1, r1, r3 ; CHECK-NEXT: add.w r0, r0, r3, lsl #2 -; CHECK-NEXT: dls lr, lr +; 
CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: .LBB11_8: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r1, [r0], #4 @@ -1201,10 +1201,10 @@ ; CHECK-NEXT: bic r3, r1, #3 ; CHECK-NEXT: movs r2, #1 ; CHECK-NEXT: subs r0, r3, #4 -; CHECK-NEXT: add.w lr, r2, r0, lsr #2 -; CHECK-NEXT: mov.w r0, #-1 +; CHECK-NEXT: add.w r0, r2, r0, lsr #2 ; CHECK-NEXT: mov r2, r12 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dls lr, r0 +; CHECK-NEXT: mov.w r0, #-1 ; CHECK-NEXT: .LBB12_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r2], #16 @@ -1215,14 +1215,14 @@ ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} ; CHECK-NEXT: .LBB12_7: @ %for.body.preheader1 -; CHECK-NEXT: sub.w lr, r1, r3 -; CHECK-NEXT: add.w r1, r12, r3, lsl #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: subs r1, r1, r3 +; CHECK-NEXT: add.w r2, r12, r3, lsl #2 +; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: .LBB12_8: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr r2, [r1], #4 -; CHECK-NEXT: cmp r0, r2 -; CHECK-NEXT: csel r0, r0, r2, hi +; CHECK-NEXT: ldr r1, [r2], #4 +; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: csel r0, r0, r1, hi ; CHECK-NEXT: le lr, .LBB12_8 ; CHECK-NEXT: .LBB12_9: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -1299,9 +1299,9 @@ ; CHECK-NEXT: movs r2, #1 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: add.w lr, r2, r12, lsr #2 +; CHECK-NEXT: add.w r2, r2, r12, lsr #2 +; CHECK-NEXT: dls lr, r2 ; CHECK-NEXT: mov r2, r0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB13_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r2], #16 @@ -1313,9 +1313,9 @@ ; CHECK-NEXT: vmaxv.u32 r2, q0 ; CHECK-NEXT: beq .LBB13_9 ; CHECK-NEXT: .LBB13_7: @ %for.body.preheader1 -; CHECK-NEXT: sub.w lr, r1, r3 +; CHECK-NEXT: subs r1, r1, r3 ; CHECK-NEXT: add.w r0, r0, r3, lsl #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: .LBB13_8: @ %for.body ; 
CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r1, [r0], #4 @@ -1397,10 +1397,10 @@ ; CHECK-NEXT: bic r3, r1, #3 ; CHECK-NEXT: movs r2, #1 ; CHECK-NEXT: subs r0, r3, #4 -; CHECK-NEXT: add.w lr, r2, r0, lsr #2 -; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: add.w r0, r2, r0, lsr #2 ; CHECK-NEXT: mov r2, r12 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dls lr, r0 +; CHECK-NEXT: movs r0, #0 ; CHECK-NEXT: .LBB14_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r2], #16 @@ -1411,14 +1411,14 @@ ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} ; CHECK-NEXT: .LBB14_7: @ %for.body.preheader1 -; CHECK-NEXT: sub.w lr, r1, r3 -; CHECK-NEXT: add.w r1, r12, r3, lsl #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: subs r1, r1, r3 +; CHECK-NEXT: add.w r2, r12, r3, lsl #2 +; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: .LBB14_8: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr r2, [r1], #4 -; CHECK-NEXT: cmp r0, r2 -; CHECK-NEXT: csel r0, r0, r2, hi +; CHECK-NEXT: ldr r1, [r2], #4 +; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: csel r0, r0, r1, hi ; CHECK-NEXT: le lr, .LBB14_8 ; CHECK-NEXT: .LBB14_9: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -1495,9 +1495,9 @@ ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: sub.w r12, r2, #4 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 +; CHECK-NEXT: add.w r3, r3, r12, lsr #2 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: mov r3, r0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB15_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r3], #16 @@ -1511,9 +1511,9 @@ ; CHECK-NEXT: cmp r2, r1 ; CHECK-NEXT: beq .LBB15_9 ; CHECK-NEXT: .LBB15_7: @ %for.body.preheader1 -; CHECK-NEXT: sub.w lr, r1, r2 +; CHECK-NEXT: subs r1, r1, r2 ; CHECK-NEXT: add.w r0, r0, r2, lsl #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: .LBB15_8: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldmia 
r0!, {s2} @@ -1600,9 +1600,9 @@ ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: sub.w r12, r2, #4 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 +; CHECK-NEXT: add.w r3, r3, r12, lsr #2 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: mov r3, r0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB16_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r3], #16 @@ -1616,9 +1616,9 @@ ; CHECK-NEXT: cmp r2, r1 ; CHECK-NEXT: beq .LBB16_9 ; CHECK-NEXT: .LBB16_7: @ %for.body.preheader1 -; CHECK-NEXT: sub.w lr, r1, r2 +; CHECK-NEXT: subs r1, r1, r2 ; CHECK-NEXT: add.w r0, r0, r2, lsl #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: .LBB16_8: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldmia r0!, {s2} @@ -1690,8 +1690,8 @@ ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: cbz r1, .LBB17_4 ; CHECK-NEXT: @ %bb.1: @ %vector.ph -; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: dlstp.32 lr, r1 +; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: .LBB17_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 @@ -1795,8 +1795,8 @@ ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: cbz r1, .LBB19_4 ; CHECK-NEXT: @ %bb.1: @ %vector.ph -; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: dlstp.16 lr, r1 +; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: .LBB19_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q0, [r0], #16 @@ -1903,8 +1903,8 @@ ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: cbz r1, .LBB21_4 ; CHECK-NEXT: @ %bb.1: @ %vector.ph -; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: dlstp.8 lr, r1 +; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: .LBB21_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrb.u8 q0, [r0], #16 @@ -2011,8 +2011,8 @@ ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: cbz r1, .LBB23_4 ; CHECK-NEXT: @ %bb.1: @ %vector.ph -; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: dlstp.16 lr, r1 +; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: 
.LBB23_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q0, [r0], #16 @@ -2116,8 +2116,8 @@ ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: cbz r1, .LBB25_4 ; CHECK-NEXT: @ %bb.1: @ %vector.ph -; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: dlstp.8 lr, r1 +; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: .LBB25_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrb.u8 q0, [r0], #16 @@ -2224,8 +2224,8 @@ ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: cbz r1, .LBB27_4 ; CHECK-NEXT: @ %bb.1: @ %vector.ph -; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: dlstp.8 lr, r1 +; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: .LBB27_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrb.u8 q0, [r0], #16 @@ -2329,9 +2329,9 @@ ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: cbz r1, .LBB29_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph +; CHECK-NEXT: dlstp.32 lr, r1 ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: mov r3, r2 -; CHECK-NEXT: dlstp.32 lr, r1 ; CHECK-NEXT: .LBB29_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 @@ -2383,8 +2383,8 @@ ; CHECK-NEXT: cbz r2, .LBB30_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: mov r3, r12 ; CHECK-NEXT: dlstp.32 lr, r2 +; CHECK-NEXT: mov r3, r12 ; CHECK-NEXT: .LBB30_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 @@ -2442,8 +2442,8 @@ ; CHECK-NEXT: cbz r2, .LBB31_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: mov r3, r12 ; CHECK-NEXT: dlstp.16 lr, r2 +; CHECK-NEXT: mov r3, r12 ; CHECK-NEXT: .LBB31_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q0, [r0], #16 diff --git a/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll b/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll --- a/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll @@ -20,32 +20,27 @@ ; 
CHECK-NEXT: bhi .LBB0_9 ; CHECK-NEXT: @ %bb.3: @ %vector.ph ; CHECK-NEXT: bic r4, r2, #7 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: sub.w r12, r4, #8 -; CHECK-NEXT: and r7, r2, #7 -; CHECK-NEXT: add.w r3, r3, r12, lsr #3 +; CHECK-NEXT: movs r5, #1 +; CHECK-NEXT: sub.w r3, r4, #8 ; CHECK-NEXT: add.w r12, r1, r4, lsl #1 -; CHECK-NEXT: mov r5, r3 +; CHECK-NEXT: add.w r5, r5, r3, lsr #3 ; CHECK-NEXT: add.w r3, r0, r4, lsl #2 +; CHECK-NEXT: dls lr, r5 +; CHECK-NEXT: and r5, r2, #7 ; CHECK-NEXT: .LBB0_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vld20.16 {q0, q1}, [r0] -; CHECK-NEXT: mov lr, r5 -; CHECK-NEXT: subs.w lr, lr, #1 ; CHECK-NEXT: vld21.16 {q0, q1}, [r0]! -; CHECK-NEXT: mov r5, lr ; CHECK-NEXT: vmul.f16 q2, q0, q0 ; CHECK-NEXT: vfma.f16 q2, q1, q1 ; CHECK-NEXT: vstrb.8 q2, [r1], #16 -; CHECK-NEXT: bne .LBB0_4 -; CHECK-NEXT: b .LBB0_5 -; CHECK-NEXT: .LBB0_5: @ %middle.block +; CHECK-NEXT: le lr, .LBB0_4 +; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r4, r2 -; CHECK-NEXT: mov lr, r7 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, r5, r7, pc} ; CHECK-NEXT: .LBB0_6: @ %while.body.preheader26 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dls lr, r5 ; CHECK-NEXT: .LBB0_7: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldr.16 s0, [r3] @@ -61,7 +56,7 @@ ; CHECK-NEXT: .LBB0_9: ; CHECK-NEXT: mov r3, r0 ; CHECK-NEXT: mov r12, r1 -; CHECK-NEXT: mov lr, r2 +; CHECK-NEXT: mov r5, r2 ; CHECK-NEXT: b .LBB0_6 entry: %cmp.not11 = icmp eq i32 %numSamples, 0 @@ -156,32 +151,27 @@ ; CHECK-NEXT: bhi .LBB1_9 ; CHECK-NEXT: @ %bb.3: @ %vector.ph ; CHECK-NEXT: bic r4, r2, #3 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: sub.w r12, r4, #4 -; CHECK-NEXT: and r7, r2, #3 -; CHECK-NEXT: add.w r3, r3, r12, lsr #2 +; CHECK-NEXT: movs r5, #1 +; CHECK-NEXT: subs r3, r4, #4 ; CHECK-NEXT: add.w r12, r1, r4, lsl #2 -; CHECK-NEXT: mov r5, r3 +; CHECK-NEXT: add.w r5, r5, r3, lsr #2 ; CHECK-NEXT: add.w r3, r0, r4, lsl #3 +; CHECK-NEXT: 
dls lr, r5 +; CHECK-NEXT: and r5, r2, #3 ; CHECK-NEXT: .LBB1_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vld20.32 {q0, q1}, [r0] -; CHECK-NEXT: mov lr, r5 -; CHECK-NEXT: subs.w lr, lr, #1 ; CHECK-NEXT: vld21.32 {q0, q1}, [r0]! -; CHECK-NEXT: mov r5, lr ; CHECK-NEXT: vmul.f32 q2, q0, q0 ; CHECK-NEXT: vfma.f32 q2, q1, q1 ; CHECK-NEXT: vstrb.8 q2, [r1], #16 -; CHECK-NEXT: bne .LBB1_4 -; CHECK-NEXT: b .LBB1_5 -; CHECK-NEXT: .LBB1_5: @ %middle.block +; CHECK-NEXT: le lr, .LBB1_4 +; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r4, r2 -; CHECK-NEXT: mov lr, r7 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, r5, r7, pc} ; CHECK-NEXT: .LBB1_6: @ %while.body.preheader26 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dls lr, r5 ; CHECK-NEXT: .LBB1_7: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldr s0, [r3] @@ -197,7 +187,7 @@ ; CHECK-NEXT: .LBB1_9: ; CHECK-NEXT: mov r3, r0 ; CHECK-NEXT: mov r12, r1 -; CHECK-NEXT: mov lr, r2 +; CHECK-NEXT: mov r5, r2 ; CHECK-NEXT: b .LBB1_6 entry: %cmp.not11 = icmp eq i32 %numSamples, 0 diff --git a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll --- a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll @@ -20,8 +20,8 @@ ; CHECK-NEXT: and.w r3, r3, r12, lsr #2 ; CHECK-NEXT: sub.w r12, r3, #8 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #3 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: add.w r3, r3, r12, lsr #3 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q5, [r0, #32] diff --git a/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll b/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll --- a/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll @@ -186,8 +186,8 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: mov.w lr, #64 -; CHECK-NEXT: dls lr, 
lr +; CHECK-NEXT: movs r3, #64 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: .LBB7_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrb.u8 q0, [r0], #16 @@ -231,8 +231,8 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: mov.w lr, #128 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: movs r3, #128 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q0, [r0], #16 @@ -276,8 +276,8 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: mov.w lr, #256 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: mov.w r3, #256 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: .LBB9_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 diff --git a/llvm/test/Transforms/HardwareLoops/ARM/calls-codegen.ll b/llvm/test/Transforms/HardwareLoops/ARM/calls-codegen.ll --- a/llvm/test/Transforms/HardwareLoops/ARM/calls-codegen.ll +++ b/llvm/test/Transforms/HardwareLoops/ARM/calls-codegen.ll @@ -4,8 +4,8 @@ ; DISABLED-NOT: dls lr, ; CHECK-LABEL: test_target_specific: -; CHECK: mov.w lr, #50 -; CHECK: dls lr, lr +; CHECK: movs r2, #50 +; CHECK: dls lr, r2 ; CHECK-NOT: mov lr, ; CHECK: [[LOOP_HEADER:\.LBB[0-9_]+]]: ; CHECK: le lr, [[LOOP_HEADER]] @@ -31,8 +31,8 @@ } ; CHECK-LABEL: test_fabs: -; CHECK: mov.w lr, #100 -; CHECK: dls lr, lr +; CHECK: movs r1, #100 +; CHECK: dls lr, r1 ; CHECK-NOT: mov lr, ; CHECK: [[LOOP_HEADER:\.LBB[0-9_]+]]: ; CHECK-NOT: bl diff --git a/llvm/test/Transforms/HardwareLoops/ARM/calls.ll b/llvm/test/Transforms/HardwareLoops/ARM/calls.ll --- a/llvm/test/Transforms/HardwareLoops/ARM/calls.ll +++ b/llvm/test/Transforms/HardwareLoops/ARM/calls.ll @@ -8,7 +8,7 @@ ; DISABLED-NOT: call i32 @llvm.loop.decrement ; CHECK-LABEL: skip_call -; CHECK-NOT: call void @llvm.set.loop.iterations +; CHECK-NOT: call i32 @llvm.start.loop.iterations ; 
CHECK-NOT: call i32 @llvm.loop.decrement define i32 @skip_call(i32 %n) { @@ -37,8 +37,8 @@ } ; CHECK-LABEL: test_target_specific -; CHECK: call void @llvm.set.loop.iterations.i32(i32 50) -; CHECK: [[COUNT:%[^ ]+]] = phi i32 [ 50, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ] +; CHECK: [[X:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 50) +; CHECK: [[COUNT:%[^ ]+]] = phi i32 [ [[X]], %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ] ; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[COUNT]], i32 1) ; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0 ; CHECK: br i1 [[CMP]], label %loop, label %exit @@ -62,10 +62,10 @@ } ; CHECK-LABEL: test_fabs_f16 -; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations -; CHECK-MVE-NOT: call void @llvm.set.loop.iterations -; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 100) -; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100) +; CHECK-MAIN-NOT: call i32 @llvm.start.loop.iterations +; CHECK-MVE-NOT: call i32 @llvm.start.loop.iterations +; CHECK-FP: call i32 @llvm.start.loop.iterations.i32(i32 100) +; CHECK-MVEFP: call i32 @llvm.start.loop.iterations.i32(i32 100) define void @test_fabs_f16(half* %a, half* %b) { entry: br label %loop @@ -84,10 +84,10 @@ } ; CHECK-LABEL: test_fabs -; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations -; CHECK-MVE-NOT: call void @llvm.set.loop.iterations -; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 100) -; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100) +; CHECK-MAIN-NOT: call i32 @llvm.start.loop.iterations +; CHECK-MVE-NOT: call i32 @llvm.start.loop.iterations +; CHECK-FP: call i32 @llvm.start.loop.iterations.i32(i32 100) +; CHECK-MVEFP: call i32 @llvm.start.loop.iterations.i32(i32 100) define float @test_fabs(float* %a) { entry: @@ -107,11 +107,11 @@ } ; CHECK-LABEL: test_fabs_64 -; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations -; CHECK-MVE-NOT: call void @llvm.set.loop.iterations -; CHECK-FP-NOT: call void 
@llvm.set.loop.iterations.i32(i32 100) -; CHECK-FP64: void @llvm.set.loop.iterations.i32(i32 100) -; CHECK-MVEFP-NOT: call void @llvm.set.loop.iterations.i32(i32 100) +; CHECK-MAIN-NOT: call i32 @llvm.start.loop.iterations +; CHECK-MVE-NOT: call i32 @llvm.start.loop.iterations +; CHECK-FP-NOT: call i32 @llvm.start.loop.iterations.i32(i32 100) +; CHECK-FP64: call i32 @llvm.start.loop.iterations.i32(i32 100) +; CHECK-MVEFP-NOT: call i32 @llvm.start.loop.iterations.i32(i32 100) define void @test_fabs_64(double* %a, double* %b) { entry: br label %loop @@ -130,9 +130,9 @@ } ; CHECK-LABEL: test_fabs_vec -; CHECK-MVE-NOT: call void @llvm.set.loop.iterations -; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100) -; CHECK-MVEFP: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ] +; CHECK-MVE-NOT: call i32 @llvm.start.loop.iterations +; CHECK-MVEFP: [[X:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 100) +; CHECK-MVEFP: [[COUNT:%[^ ]+]] = phi i32 [ [[X]], %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ] ; CHECK-MVEFP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[COUNT]], i32 1) ; CHECK-MVEFP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0 ; CHECK-MVEFP: br i1 [[CMP]], label %loop, label %exit @@ -154,7 +154,7 @@ } ; CHECK-LABEL: test_log -; CHECK-NOT: call void @llvm.set.loop.iterations +; CHECK-NOT: call i32 @llvm.start.loop.iterations ; CHECK-NOT: llvm.loop.decrement define float @test_log(float* %a) { entry: @@ -174,11 +174,11 @@ } ; CHECK-LABEL: test_sqrt_16 -; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations -; CHECK-MVE-NOT: call void @llvm.set.loop.iterations -; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 100) -; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100) -; CHECK-FP64: call void @llvm.set.loop.iterations.i32(i32 100) +; CHECK-MAIN-NOT: call i32 @llvm.start.loop.iterations +; CHECK-MVE-NOT: call i32 @llvm.start.loop.iterations +; CHECK-FP: call i32 @llvm.start.loop.iterations.i32(i32 100) 
+; CHECK-MVEFP: call i32 @llvm.start.loop.iterations.i32(i32 100) +; CHECK-FP64: call i32 @llvm.start.loop.iterations.i32(i32 100) define void @test_sqrt_16(half* %a, half* %b) { entry: br label %loop @@ -196,11 +196,11 @@ ret void } ; CHECK-LABEL: test_sqrt -; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations -; CHECK-MVE-NOT: call void @llvm.set.loop.iterations -; CHECK-FP: call void @llvm.set.loop.iterations -; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100) -; CHECK-MVEFP: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ] +; CHECK-MAIN-NOT: call i32 @llvm.start.loop.iterations +; CHECK-MVE-NOT: call i32 @llvm.start.loop.iterations +; CHECK-FP: call i32 @llvm.start.loop.iterations +; CHECK-MVEFP: [[X:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 100) +; CHECK-MVEFP: [[COUNT:%[^ ]+]] = phi i32 [ [[X]], %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ] ; CHECK-MVEFP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[COUNT]], i32 1) ; CHECK-MVEFP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0 ; CHECK-MVEFP: br i1 [[CMP]], label %loop, label %exit @@ -222,11 +222,11 @@ } ; CHECK-LABEL: test_sqrt_64 -; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations -; CHECK-MVE-NOT: call void @llvm.set.loop.iterations -; CHECK-FP-NOT: call void @llvm.set.loop.iterations.i32(i32 100) -; CHECK-MVEFP-NOT: call void @llvm.set.loop.iterations.i32(i32 100) -; CHECK-FP64: call void @llvm.set.loop.iterations.i32(i32 100) +; CHECK-MAIN-NOT: call i32 @llvm.start.loop.iterations +; CHECK-MVE-NOT: call i32 @llvm.start.loop.iterations +; CHECK-FP-NOT: call i32 @llvm.start.loop.iterations.i32(i32 100) +; CHECK-MVEFP-NOT: call i32 @llvm.start.loop.iterations.i32(i32 100) +; CHECK-FP64: call i32 @llvm.start.loop.iterations.i32(i32 100) define void @test_sqrt_64(double* %a, double* %b) { entry: br label %loop @@ -245,10 +245,10 @@ } ; CHECK-LABEL: test_sqrt_vec -; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations -; CHECK-MVE-NOT: call void 
@llvm.set.loop.iterations -; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 100) -; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100) +; CHECK-MAIN-NOT: call i32 @llvm.start.loop.iterations +; CHECK-MVE-NOT: call i32 @llvm.start.loop.iterations +; CHECK-FP: call i32 @llvm.start.loop.iterations.i32(i32 100) +; CHECK-MVEFP: call i32 @llvm.start.loop.iterations.i32(i32 100) define void @test_sqrt_vec(<4 x float>* %a, <4 x float>* %b) { entry: br label %loop @@ -267,7 +267,7 @@ } ; CHECK-LABEL: test_overflow -; CHECK: call void @llvm.set.loop.iterations +; CHECK: call i32 @llvm.start.loop.iterations define i32 @test_overflow(i32* %a, i32* %b) { entry: br label %loop @@ -289,7 +289,7 @@ ; TODO: We should be able to generate a qadd/sub ; CHECK-LABEL: test_sat -; CHECK: call void @llvm.set.loop.iterations.i32(i32 100) +; CHECK: call i32 @llvm.start.loop.iterations.i32(i32 100) define i32 @test_sat(i32* %a, i32* %b) { entry: br label %loop @@ -309,10 +309,10 @@ } ; CHECK-LABEL: test_masked_i32 -; CHECK-NOT: call void @llvm.set.loop.iterations -; CHECK-MVEFP: call void @llvm.set.loop.iterations -; CHECK-MVE: call void @llvm.set.loop.iterations.i32(i32 100) -; CHECK-MVE: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ] +; CHECK-NOT: call i32 @llvm.start.loop.iterations +; CHECK-MVEFP: call i32 @llvm.start.loop.iterations +; CHECK-MVE: [[X:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 100) +; CHECK-MVE: [[COUNT:%[^ ]+]] = phi i32 [ [[X]], %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ] ; CHECK-MVE: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[COUNT]], i32 1) ; CHECK-MVE: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0 ; CHECK-MVE: br i1 [[CMP]], label %loop, label %exit @@ -336,10 +336,10 @@ } ; CHECK-LABEL: test_masked_f32 -; CHECK-NOT: call void @llvm.set.loop.iterations -; CHECK-MVEFP: call void @llvm.set.loop.iterations -; CHECK-MVE: call void @llvm.set.loop.iterations.i32(i32 100) -; CHECK-MVE: [[COUNT:%[^ ]+]] = 
phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ] +; CHECK-NOT: call i32 @llvm.start.loop.iterations +; CHECK-MVEFP: call i32 @llvm.start.loop.iterations +; CHECK-MVE: [[X:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 100) +; CHECK-MVE: [[COUNT:%[^ ]+]] = phi i32 [ [[X]], %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ] ; CHECK-MVE: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[COUNT]], i32 1) ; CHECK-MVE: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0 ; CHECK-MVE: br i1 [[CMP]], label %loop, label %exit @@ -363,10 +363,10 @@ } ; CHECK-LABEL: test_gather_scatter -; CHECK-NOT: call void @llvm.set.loop.iterations -; CHECK-MVEFP: call void @llvm.set.loop.iterations -; CHECK-MVE: call void @llvm.set.loop.iterations.i32(i32 100) -; CHECK-MVE: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ] +; CHECK-NOT: call i32 @llvm.start.loop.iterations +; CHECK-MVEFP: call i32 @llvm.start.loop.iterations +; CHECK-MVE: [[X:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 100) +; CHECK-MVE: [[COUNT:%[^ ]+]] = phi i32 [ [[X]], %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ] ; CHECK-MVE: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[COUNT]], i32 1) ; CHECK-MVE: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0 ; CHECK-MVE: br i1 [[CMP]], label %loop, label %exit diff --git a/llvm/test/Transforms/HardwareLoops/ARM/fp-emulation.ll b/llvm/test/Transforms/HardwareLoops/ARM/fp-emulation.ll --- a/llvm/test/Transforms/HardwareLoops/ARM/fp-emulation.ll +++ b/llvm/test/Transforms/HardwareLoops/ARM/fp-emulation.ll @@ -2,17 +2,17 @@ ; RUN: opt -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+soft-float -hardware-loops %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-SOFT ; CHECK-LABEL: test_fptosi -; CHECK-SOFT-NOT: call void @llvm.set.loop.iterations +; CHECK-SOFT-NOT: call i32 @llvm.start.loop.iterations ; CHECK: entry: ; CHECK-FP: [[CMP:%[^ ]+]] = icmp ugt i32 %n, 1 ; CHECK-FP: [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %n, 
i32 1 ; CHECK: while.body.lr.ph: -; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]]) +; CHECK-FP: [[START:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 [[COUNT]]) ; CHECK-FP-NEXT: br label %while.body -; CHECK-FP: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %if.end4 ] +; CHECK-FP: [[REM:%[^ ]+]] = phi i32 [ [[START]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %if.end4 ] ; CHECK-FP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[REM]], i32 1) ; CHECK-FP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0 ; CHECK-FP: br i1 [[CMP]], label %while.body, label %cleanup.loopexit @@ -59,15 +59,15 @@ ; CHECK-FP: [[CMP:%[^ ]+]] = icmp ugt i32 %n, 1 ; CHECK-FP: [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 1 ; CHECK-FP: while.body.lr.ph: -; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]]) +; CHECK-FP: [[START:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 [[COUNT]]) ; CHECK-FP-NEXT: br label %while.body -; CHECK-FP: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %if.end4 ] +; CHECK-FP: [[REM:%[^ ]+]] = phi i32 [ [[START]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %if.end4 ] ; CHECK-FP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[REM]], i32 1) ; CHECK-FP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0 ; CHECK-FP: br i1 [[CMP]], label %while.body, label %cleanup.loopexit -; CHECK-SOFT-NOT: call void @llvm.set.loop.iterations +; CHECK-SOFT-NOT: call i32 @llvm.start.loop.iterations define void @test_fptoui(i32 %n, i32** %g, double** %d) { entry: @@ -111,10 +111,10 @@ ; CHECK: [[CMP:%[^ ]+]] = icmp ugt i32 %n, 1 ; CHECK: [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 1 ; CHECK: while.body.lr.ph: -; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]]) +; CHECK: [[START:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 [[COUNT]]) ; CHECK-NEXT: br label %while.body -; CHECK: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], 
%while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %if.end4 ] +; CHECK: [[REM:%[^ ]+]] = phi i32 [ [[START]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %if.end4 ] ; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[REM]], i32 1) ; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0 ; CHECK: br i1 [[CMP]], label %while.body, label %cleanup.loopexit @@ -156,17 +156,17 @@ } ; CHECK-LABEL: fp_add -; CHECK-SOFT-NOT: call void @llvm.set.loop.iterations +; CHECK-SOFT-NOT: call i32 @llvm.start.loop.iterations ; CHECK: entry: ; CHECK-FP: [[CMP:%[^ ]+]] = icmp ugt i32 %n, 1 ; CHECK-FP: [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 1 ; CHECK: while.body.lr.ph: -; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]]) +; CHECK-FP: [[START:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 [[COUNT]]) ; CHECK: br label %while.body ; CHECK-SOFT-NOT: call i32 @llvm.loop.decrement -; CHECK-FP: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %if.end4 ] +; CHECK-FP: [[REM:%[^ ]+]] = phi i32 [ [[START]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %if.end4 ] ; CHECK-FP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[REM]], i32 1) ; CHECK-FP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0 ; CHECK-FP: br i1 [[CMP]], label %while.body, label %cleanup.loopexit diff --git a/llvm/test/Transforms/HardwareLoops/ARM/simple-do.ll b/llvm/test/Transforms/HardwareLoops/ARM/simple-do.ll --- a/llvm/test/Transforms/HardwareLoops/ARM/simple-do.ll +++ b/llvm/test/Transforms/HardwareLoops/ARM/simple-do.ll @@ -8,10 +8,10 @@ @g = common local_unnamed_addr global i32* null, align 4 ; CHECK-LABEL: do_copy -; CHECK: call void @llvm.set.loop.iterations.i32(i32 %n) +; CHECK: [[START:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 %n) ; CHECK: br label %while.body -; CHECK: [[REM:%[^ ]+]] = phi i32 [ %n, %entry ], [ [[LOOP_DEC:%[^ ]+]], %while.body ] +; CHECK: [[REM:%[^ ]+]] = phi i32 [ [[START]], %entry ], [ [[LOOP_DEC:%[^ ]+]], 
%while.body ] ; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[REM]], i32 1) ; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0 ; CHECK: br i1 [[CMP]], label %while.body, label %while.end @@ -99,10 +99,10 @@ ; CHECK: [[COUNT:%[^ ]+]] = add nuw i32 [[HALVE]], 1 ; CHECK: while.body.lr.ph: -; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]]) +; CHECK: [[START:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 [[COUNT]]) ; CHECK: br label %while.body ; CHECK: while.body: -; CHECK: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %while.body ] +; CHECK: [[REM:%[^ ]+]] = phi i32 [ [[START]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %while.body ] ; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[REM]], i32 1) ; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0 ; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit @@ -152,10 +152,10 @@ ; CHECK: [[COUNT:%[^ ]+]] = add nuw i32 [[HALVE]], 1 ; CHECK: while.body.lr.ph: -; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]]) +; CHECK: [[START:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 [[COUNT]]) ; CHECK: br label %while.body -; CHECK: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %while.body ] +; CHECK: [[REM:%[^ ]+]] = phi i32 [ [[START]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %while.body ] ; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[REM]], i32 1) ; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0 ; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit diff --git a/llvm/test/Transforms/HardwareLoops/ARM/structure.ll b/llvm/test/Transforms/HardwareLoops/ARM/structure.ll --- a/llvm/test/Transforms/HardwareLoops/ARM/structure.ll +++ b/llvm/test/Transforms/HardwareLoops/ARM/structure.ll @@ -51,13 +51,13 @@ } ; CHECK-LABEL: nested -; CHECK-NOT: call void @llvm.set.loop.iterations.i32(i32 %N) +; CHECK-NOT: call i32 
@llvm.start.loop.iterations.i32(i32 %N) ; CHECK: br i1 %cmp20, label %while.end7, label %while.cond1.preheader.us -; CHECK: call void @llvm.set.loop.iterations.i32(i32 %N) +; CHECK: [[START:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 %N) ; CHECK: br label %while.body3.us -; CHECK: [[REM:%[^ ]+]] = phi i32 [ %N, %while.cond1.preheader.us ], [ [[LOOP_DEC:%[^ ]+]], %while.body3.us ] +; CHECK: [[REM:%[^ ]+]] = phi i32 [ [[START]], %while.cond1.preheader.us ], [ [[LOOP_DEC:%[^ ]+]], %while.body3.us ] ; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[REM]], i32 1) ; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0 ; CHECK: br i1 [[CMP]], label %while.body3.us, label %while.cond1.while.end_crit_edge.us @@ -103,19 +103,19 @@ } ; CHECK-LABEL: pre_existing -; CHECK: llvm.set.loop.iterations -; CHECK-NOT: llvm.set.loop.iterations +; CHECK: llvm.start.loop.iterations +; CHECK-NOT: llvm.start.loop.iterations ; CHECK: call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1) ; CHECK-NOT: call i32 @llvm.loop.decrement.reg define i32 @pre_existing(i32 %n, i32* nocapture %p, i32* nocapture readonly %q) { entry: - call void @llvm.set.loop.iterations.i32(i32 %n) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %n) br label %while.body while.body: ; preds = %while.body, %entry %q.addr.05 = phi i32* [ %incdec.ptr, %while.body ], [ %q, %entry ] %p.addr.04 = phi i32* [ %incdec.ptr1, %while.body ], [ %p, %entry ] - %0 = phi i32 [ %n, %entry ], [ %2, %while.body ] + %0 = phi i32 [ %start, %entry ], [ %2, %while.body ] %incdec.ptr = getelementptr inbounds i32, i32* %q.addr.05, i32 1 %1 = load i32, i32* %q.addr.05, align 4 %incdec.ptr1 = getelementptr inbounds i32, i32* %p.addr.04, i32 1 @@ -158,9 +158,9 @@ } ; CHECK-LABEL: pre_existing_inner -; CHECK-NOT: llvm.set.loop.iterations +; CHECK-NOT: llvm.start.loop.iterations ; CHECK: while.cond1.preheader.us: -; CHECK: call void @llvm.set.loop.iterations.i32(i32 %N) +; CHECK: call i32 
@llvm.start.loop.iterations.i32(i32 %N) ; CHECK: call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1) ; CHECK: br i1 ; CHECK-NOT: call i32 @llvm.loop.decrement @@ -172,12 +172,12 @@ while.cond1.preheader.us: %i.021.us = phi i32 [ %inc6.us, %while.cond1.while.end_crit_edge.us ], [ 0, %entry ] %mul.us = mul i32 %i.021.us, %N - call void @llvm.set.loop.iterations.i32(i32 %N) + %start = call i32 @llvm.start.loop.iterations.i32(i32 %N) br label %while.body3.us while.body3.us: %j.019.us = phi i32 [ 0, %while.cond1.preheader.us ], [ %inc.us, %while.body3.us ] - %0 = phi i32 [ %N, %while.cond1.preheader.us ], [ %1, %while.body3.us ] + %0 = phi i32 [ %start, %while.cond1.preheader.us ], [ %1, %while.body3.us ] %add.us = add i32 %j.019.us, %mul.us %arrayidx.us = getelementptr inbounds i32, i32* %A, i32 %add.us store i32 %add.us, i32* %arrayidx.us, align 4 @@ -196,7 +196,7 @@ } ; CHECK-LABEL: not_rotated -; CHECK-NOT: call void @llvm.set.loop.iterations +; CHECK-NOT: call i32 @llvm.start.loop.iterations ; CHECK-NOT: call i32 @llvm.loop.decrement.i32 define void @not_rotated(i32, i16* nocapture, i16 signext) { br label %4 @@ -233,7 +233,7 @@ } ; CHECK-LABEL: multi_latch -; CHECK-NOT: call void @llvm.set.loop.iterations +; CHECK-NOT: call i32 @llvm.start.loop.iterations ; CHECK-NOT: call i32 @llvm.loop.decrement define void @multi_latch(i32* %a, i32* %b, i32 %N) { entry: @@ -322,7 +322,7 @@ } ; CHECK-LABEL: unroll_inc_int -; CHECK: call void @llvm.set.loop.iterations.i32(i32 %N) +; CHECK: call i32 @llvm.start.loop.iterations.i32(i32 %N) ; CHECK: call i32 @llvm.loop.decrement.reg.i32( ; TODO: We should be able to support the unrolled loop body. 
@@ -404,7 +404,7 @@ } ; CHECK-LABEL: unroll_dec_int -; CHECK: call void @llvm.set.loop.iterations.i32(i32 %N) +; CHECK: call i32 @llvm.start.loop.iterations.i32(i32 %N) ; CHECK: call i32 @llvm.loop.decrement.reg.i32( ; TODO: An unnecessary register is being held to hold COUNT, lr should just @@ -420,7 +420,7 @@ ; CHECK-UNROLL-NEXT: [[PROLOGUE:.LBB[0-9_]+]]: ; CHECK-UNROLL: le lr, [[PROLOGUE]] ; CHECK-UNROLL-NEXT: [[PROLOGUE_EXIT:.LBB[0-9_]+]]: -; CHECK-UNROLL: dls lr, lr +; CHECK-UNROLL: dls lr, r5 ; CHECK-UNROLL: [[BODY:.LBB[0-9_]+]]: ; CHECK-UNROLL: le lr, [[BODY]] ; CHECK-UNROLL-NOT: b @@ -447,7 +447,7 @@ br i1 %cmp, label %for.body, label %for.cond.cleanup } -declare void @llvm.set.loop.iterations.i32(i32) #0 +declare i32 @llvm.start.loop.iterations.i32(i32) #0 declare i1 @llvm.test.set.loop.iterations.i32(i32) #0 declare i32 @llvm.loop.decrement.reg.i32(i32, i32) #0 diff --git a/llvm/test/Transforms/HardwareLoops/loop-guards.ll b/llvm/test/Transforms/HardwareLoops/loop-guards.ll --- a/llvm/test/Transforms/HardwareLoops/loop-guards.ll +++ b/llvm/test/Transforms/HardwareLoops/loop-guards.ll @@ -11,7 +11,8 @@ ; CHECK: [[COUNT:%[^ ]+]] = add i32 [[MAX]], -1 ; CHECK: br i1 %t1, label %do.body.preheader ; CHECK: do.body.preheader: -; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]]) +; CHECK-EXIT: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]]) +; CHECK-LATCH: call i32 @llvm.start.loop.iterations.i32(i32 [[COUNT]]) ; CHECK: br label %do.body define void @test1(i1 zeroext %t1, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { entry: @@ -36,6 +37,7 @@ ; CHECK-LABEL: test2 ; CHECK-NOT: call i1 @llvm.test.set.loop.iterations ; CHECK-NOT: call void @llvm.set.loop.iterations +; CHECK-NOT: call i32 @llvm.start.loop.iterations define void @test2(i1 zeroext %t1, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { entry: br i1 %t1, label %do.body, label %if.end @@ -62,7 +64,8 @@ ; CHECK: [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %N, i32 1 ; 
CHECK: br i1 %brmerge.demorgan, label %do.body.preheader ; CHECK: do.body.preheader: -; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]]) +; CHECK-EXIT: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]]) +; CHECK-LATCH: call i32 @llvm.start.loop.iterations.i32(i32 [[COUNT]]) ; CHECK: br label %do.body define void @test3(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { entry: @@ -88,7 +91,7 @@ ; CHECK-LABEL: test4 ; CHECK: entry: ; CHECK-LATCH: br i1 %brmerge.demorgan, label %while.cond -; CHECK-LATCH-NOT: call void @llvm{{.*}}loop.iterations +; CHECK-LATCH-NOT: @llvm{{.*}}loop.iterations ; CHECK-EXIT: br i1 %brmerge.demorgan, label %while.cond.preheader ; CHECK-EXIT: while.cond.preheader: ; CHECK-EXIT: [[COUNT:%[^ ]+]] = add i32 %N, 1 @@ -122,7 +125,8 @@ ; CHECK: entry: ; CHECK: br i1 %or.cond, label %while.body.preheader ; CHECK: while.body.preheader: -; CHECK: call void @llvm.set.loop.iterations.i32(i32 %N) +; CHECK-EXIT: call void @llvm.set.loop.iterations.i32(i32 %N) +; CHECK-LATCH: call i32 @llvm.start.loop.iterations.i32(i32 %N) ; CHECK: br label %while.body define void @test5(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { entry: @@ -221,7 +225,8 @@ ; CHECK: while.preheader: ; CHECK: br i1 %brmerge.demorgan, label %while.body.preheader ; CHECK: while.body.preheader: -; CHECK: call void @llvm.set.loop.iterations.i32(i32 %N) +; CHECK-EXIT: call void @llvm.set.loop.iterations.i32(i32 %N) +; CHECK-LATCH: call i32 @llvm.start.loop.iterations.i32(i32 %N) ; CHECK: br label %while.body define void @test8(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { entry: @@ -252,7 +257,8 @@ ; CHECK: entry: ; CHECK: br i1 %brmerge.demorgan, label %do.body.preheader ; CHECK: do.body.preheader: -; CHECK: call void @llvm.set.loop.iterations.i32(i32 %N) +; CHECK-EXIT: call void @llvm.set.loop.iterations.i32(i32 %N) +; CHECK-LATCH: call i32 
@llvm.start.loop.iterations.i32(i32 %N) ; CHECK: br label %do.body define void @test9(i1 zeroext %t1, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { entry: @@ -280,7 +286,8 @@ ; CHECK: entry: ; CHECK: br i1 %cmp.1, label %do.body.preheader ; CHECK: do.body.preheader: -; CHECK: call void @llvm.set.loop.iterations.i32(i32 +; CHECK-EXIT: call void @llvm.set.loop.iterations.i32(i32 +; CHECK-LATCH: call i32 @llvm.start.loop.iterations.i32(i32 ; CHECK: br label %do.body define void @test10(i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { entry: diff --git a/llvm/test/Transforms/HardwareLoops/scalar-while.ll b/llvm/test/Transforms/HardwareLoops/scalar-while.ll --- a/llvm/test/Transforms/HardwareLoops/scalar-while.ll +++ b/llvm/test/Transforms/HardwareLoops/scalar-while.ll @@ -30,17 +30,17 @@ ; CHECK-PHI-NEXT: br i1 [[CMP4]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]] ; CHECK-PHI: while.body.preheader: ; CHECK-PHI-NEXT: [[TMP0:%.*]] = sub i32 [[N]], [[I]] -; CHECK-PHI-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP0]]) +; CHECK-PHI-NEXT: [[TMP1:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP0]]) ; CHECK-PHI-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK-PHI: while.body: ; CHECK-PHI-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ [[I]], [[WHILE_BODY_PREHEADER]] ] -; CHECK-PHI-NEXT: [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP2:%.*]], [[WHILE_BODY]] ] +; CHECK-PHI-NEXT: [[TMP2:%.*]] = phi i32 [ [[TMP1]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP3:%.*]], [[WHILE_BODY]] ] ; CHECK-PHI-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]] ; CHECK-PHI-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4 ; CHECK-PHI-NEXT: [[INC]] = add nuw i32 [[I_ADDR_05]], 1 -; CHECK-PHI-NEXT: [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP1]], i32 1) -; CHECK-PHI-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0 -; CHECK-PHI-NEXT: br i1 [[TMP3]], label 
[[WHILE_BODY]], label [[WHILE_END]] +; CHECK-PHI-NEXT: [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP2]], i32 1) +; CHECK-PHI-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 +; CHECK-PHI-NEXT: br i1 [[TMP4]], label [[WHILE_BODY]], label [[WHILE_END]] ; CHECK-PHI: while.end: ; CHECK-PHI-NEXT: ret void ; @@ -86,17 +86,17 @@ ; CHECK-PHIGUARD-NEXT: [[TMP0:%.*]] = sub i32 [[N]], [[I]] ; CHECK-PHIGUARD-NEXT: br i1 [[CMP4]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]] ; CHECK-PHIGUARD: while.body.preheader: -; CHECK-PHIGUARD-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP0]]) +; CHECK-PHIGUARD-NEXT: [[TMP1:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP0]]) ; CHECK-PHIGUARD-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK-PHIGUARD: while.body: ; CHECK-PHIGUARD-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ [[I]], [[WHILE_BODY_PREHEADER]] ] -; CHECK-PHIGUARD-NEXT: [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP2:%.*]], [[WHILE_BODY]] ] +; CHECK-PHIGUARD-NEXT: [[TMP2:%.*]] = phi i32 [ [[TMP1]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP3:%.*]], [[WHILE_BODY]] ] ; CHECK-PHIGUARD-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]] ; CHECK-PHIGUARD-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4 ; CHECK-PHIGUARD-NEXT: [[INC]] = add nuw i32 [[I_ADDR_05]], 1 -; CHECK-PHIGUARD-NEXT: [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP1]], i32 1) -; CHECK-PHIGUARD-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0 -; CHECK-PHIGUARD-NEXT: br i1 [[TMP3]], label [[WHILE_BODY]], label [[WHILE_END]] +; CHECK-PHIGUARD-NEXT: [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP2]], i32 1) +; CHECK-PHIGUARD-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 +; CHECK-PHIGUARD-NEXT: br i1 [[TMP4]], label [[WHILE_BODY]], label [[WHILE_END]] ; CHECK-PHIGUARD: while.end: ; CHECK-PHIGUARD-NEXT: ret void ; @@ -160,17 +160,17 @@ ; CHECK-PHI-NEXT: br i1 [[CMP4]], label 
[[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]] ; CHECK-PHI: while.body.preheader: ; CHECK-PHI-NEXT: [[TMP0:%.*]] = sub i32 [[I]], [[N]] -; CHECK-PHI-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP0]]) +; CHECK-PHI-NEXT: [[TMP1:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP0]]) ; CHECK-PHI-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK-PHI: while.body: ; CHECK-PHI-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[I]], [[WHILE_BODY_PREHEADER]] ] -; CHECK-PHI-NEXT: [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP2:%.*]], [[WHILE_BODY]] ] +; CHECK-PHI-NEXT: [[TMP2:%.*]] = phi i32 [ [[TMP1]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP3:%.*]], [[WHILE_BODY]] ] ; CHECK-PHI-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]] ; CHECK-PHI-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4 ; CHECK-PHI-NEXT: [[DEC]] = add nsw i32 [[I_ADDR_05]], -1 -; CHECK-PHI-NEXT: [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP1]], i32 1) -; CHECK-PHI-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0 -; CHECK-PHI-NEXT: br i1 [[TMP3]], label [[WHILE_BODY]], label [[WHILE_END]] +; CHECK-PHI-NEXT: [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP2]], i32 1) +; CHECK-PHI-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 +; CHECK-PHI-NEXT: br i1 [[TMP4]], label [[WHILE_BODY]], label [[WHILE_END]] ; CHECK-PHI: while.end: ; CHECK-PHI-NEXT: ret void ; @@ -216,17 +216,17 @@ ; CHECK-PHIGUARD-NEXT: [[TMP0:%.*]] = sub i32 [[I]], [[N]] ; CHECK-PHIGUARD-NEXT: br i1 [[CMP4]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]] ; CHECK-PHIGUARD: while.body.preheader: -; CHECK-PHIGUARD-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP0]]) +; CHECK-PHIGUARD-NEXT: [[TMP1:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP0]]) ; CHECK-PHIGUARD-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK-PHIGUARD: while.body: ; CHECK-PHIGUARD-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[DEC:%.*]], 
[[WHILE_BODY]] ], [ [[I]], [[WHILE_BODY_PREHEADER]] ] -; CHECK-PHIGUARD-NEXT: [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP2:%.*]], [[WHILE_BODY]] ] +; CHECK-PHIGUARD-NEXT: [[TMP2:%.*]] = phi i32 [ [[TMP1]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP3:%.*]], [[WHILE_BODY]] ] ; CHECK-PHIGUARD-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]] ; CHECK-PHIGUARD-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4 ; CHECK-PHIGUARD-NEXT: [[DEC]] = add nsw i32 [[I_ADDR_05]], -1 -; CHECK-PHIGUARD-NEXT: [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP1]], i32 1) -; CHECK-PHIGUARD-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0 -; CHECK-PHIGUARD-NEXT: br i1 [[TMP3]], label [[WHILE_BODY]], label [[WHILE_END]] +; CHECK-PHIGUARD-NEXT: [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP2]], i32 1) +; CHECK-PHIGUARD-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 +; CHECK-PHIGUARD-NEXT: br i1 [[TMP4]], label [[WHILE_BODY]], label [[WHILE_END]] ; CHECK-PHIGUARD: while.end: ; CHECK-PHIGUARD-NEXT: ret void ; @@ -292,17 +292,17 @@ ; CHECK-PHI: while.body.preheader: ; CHECK-PHI-NEXT: [[TMP0:%.*]] = add i32 [[I]], 1 ; CHECK-PHI-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[N]] -; CHECK-PHI-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP1]]) +; CHECK-PHI-NEXT: [[TMP2:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP1]]) ; CHECK-PHI-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK-PHI: while.body: ; CHECK-PHI-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[I]], [[WHILE_BODY_PREHEADER]] ] -; CHECK-PHI-NEXT: [[TMP2:%.*]] = phi i32 [ [[TMP1]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP3:%.*]], [[WHILE_BODY]] ] +; CHECK-PHI-NEXT: [[TMP3:%.*]] = phi i32 [ [[TMP2]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP4:%.*]], [[WHILE_BODY]] ] ; CHECK-PHI-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]] ; CHECK-PHI-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4 ; 
CHECK-PHI-NEXT: [[DEC]] = add nsw i32 [[I_ADDR_05]], -1 -; CHECK-PHI-NEXT: [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP2]], i32 1) -; CHECK-PHI-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 -; CHECK-PHI-NEXT: br i1 [[TMP4]], label [[WHILE_BODY]], label [[WHILE_END]] +; CHECK-PHI-NEXT: [[TMP4]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP3]], i32 1) +; CHECK-PHI-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP4]], 0 +; CHECK-PHI-NEXT: br i1 [[TMP5]], label [[WHILE_BODY]], label [[WHILE_END]] ; CHECK-PHI: while.end: ; CHECK-PHI-NEXT: ret void ; @@ -351,17 +351,17 @@ ; CHECK-PHIGUARD: while.body.preheader: ; CHECK-PHIGUARD-NEXT: [[TMP0:%.*]] = add i32 [[I]], 1 ; CHECK-PHIGUARD-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[N]] -; CHECK-PHIGUARD-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP1]]) +; CHECK-PHIGUARD-NEXT: [[TMP2:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP1]]) ; CHECK-PHIGUARD-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK-PHIGUARD: while.body: ; CHECK-PHIGUARD-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[I]], [[WHILE_BODY_PREHEADER]] ] -; CHECK-PHIGUARD-NEXT: [[TMP2:%.*]] = phi i32 [ [[TMP1]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP3:%.*]], [[WHILE_BODY]] ] +; CHECK-PHIGUARD-NEXT: [[TMP3:%.*]] = phi i32 [ [[TMP2]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP4:%.*]], [[WHILE_BODY]] ] ; CHECK-PHIGUARD-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]] ; CHECK-PHIGUARD-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4 ; CHECK-PHIGUARD-NEXT: [[DEC]] = add nsw i32 [[I_ADDR_05]], -1 -; CHECK-PHIGUARD-NEXT: [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP2]], i32 1) -; CHECK-PHIGUARD-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 -; CHECK-PHIGUARD-NEXT: br i1 [[TMP4]], label [[WHILE_BODY]], label [[WHILE_END]] +; CHECK-PHIGUARD-NEXT: [[TMP4]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP3]], i32 1) +; CHECK-PHIGUARD-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP4]], 0 +; 
CHECK-PHIGUARD-NEXT: br i1 [[TMP5]], label [[WHILE_BODY]], label [[WHILE_END]] ; CHECK-PHIGUARD: while.end: ; CHECK-PHIGUARD-NEXT: ret void ; @@ -424,17 +424,17 @@ ; CHECK-PHI-NEXT: [[CMP:%.*]] = icmp ne i32 [[N:%.*]], 0 ; CHECK-PHI-NEXT: br i1 [[CMP]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]] ; CHECK-PHI: while.body.preheader: -; CHECK-PHI-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[N]]) +; CHECK-PHI-NEXT: [[TMP0:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[N]]) ; CHECK-PHI-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK-PHI: while.body: ; CHECK-PHI-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ] -; CHECK-PHI-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP1:%.*]], [[WHILE_BODY]] ] +; CHECK-PHI-NEXT: [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP2:%.*]], [[WHILE_BODY]] ] ; CHECK-PHI-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]] ; CHECK-PHI-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4 ; CHECK-PHI-NEXT: [[INC]] = add nuw i32 [[I_ADDR_05]], 1 -; CHECK-PHI-NEXT: [[TMP1]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1) -; CHECK-PHI-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-PHI-NEXT: br i1 [[TMP2]], label [[WHILE_BODY]], label [[WHILE_END]] +; CHECK-PHI-NEXT: [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP1]], i32 1) +; CHECK-PHI-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0 +; CHECK-PHI-NEXT: br i1 [[TMP3]], label [[WHILE_BODY]], label [[WHILE_END]] ; CHECK-PHI: while.end: ; CHECK-PHI-NEXT: ret void ; @@ -548,17 +548,17 @@ ; CHECK-PHI-NEXT: [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 0 ; CHECK-PHI-NEXT: br i1 [[CMP]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]] ; CHECK-PHI: while.body.preheader: -; CHECK-PHI-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[N]]) +; CHECK-PHI-NEXT: [[TMP0:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 
[[N]]) ; CHECK-PHI-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK-PHI: while.body: ; CHECK-PHI-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ] -; CHECK-PHI-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP1:%.*]], [[WHILE_BODY]] ] +; CHECK-PHI-NEXT: [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP2:%.*]], [[WHILE_BODY]] ] ; CHECK-PHI-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]] ; CHECK-PHI-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4 ; CHECK-PHI-NEXT: [[INC]] = add nuw i32 [[I_ADDR_05]], 1 -; CHECK-PHI-NEXT: [[TMP1]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1) -; CHECK-PHI-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-PHI-NEXT: br i1 [[TMP2]], label [[WHILE_BODY]], label [[WHILE_END]] +; CHECK-PHI-NEXT: [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP1]], i32 1) +; CHECK-PHI-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0 +; CHECK-PHI-NEXT: br i1 [[TMP3]], label [[WHILE_BODY]], label [[WHILE_END]] ; CHECK-PHI: while.end: ; CHECK-PHI-NEXT: ret void ; @@ -676,17 +676,17 @@ ; CHECK-PHI-NEXT: [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 0 ; CHECK-PHI-NEXT: br i1 [[CMP]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]] ; CHECK-PHI: while.body.preheader: -; CHECK-PHI-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[N]]) +; CHECK-PHI-NEXT: [[TMP0:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[N]]) ; CHECK-PHI-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK-PHI: while.body: ; CHECK-PHI-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ] -; CHECK-PHI-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP1:%.*]], [[WHILE_BODY]] ] +; CHECK-PHI-NEXT: [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP2:%.*]], [[WHILE_BODY]] ] ; CHECK-PHI-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 
[[I_ADDR_05]] ; CHECK-PHI-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4 ; CHECK-PHI-NEXT: [[INC]] = add nuw i32 [[I_ADDR_05]], 1 -; CHECK-PHI-NEXT: [[TMP1]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1) -; CHECK-PHI-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-PHI-NEXT: br i1 [[TMP2]], label [[WHILE_BODY]], label [[WHILE_END]] +; CHECK-PHI-NEXT: [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP1]], i32 1) +; CHECK-PHI-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0 +; CHECK-PHI-NEXT: br i1 [[TMP3]], label [[WHILE_BODY]], label [[WHILE_END]] ; CHECK-PHI: while.end: ; CHECK-PHI-NEXT: ret void ; @@ -820,18 +820,18 @@ ; CHECK-PHI: while.cond1.preheader.us: ; CHECK-PHI-NEXT: [[I_021_US:%.*]] = phi i32 [ [[INC6_US:%.*]], [[WHILE_COND1_WHILE_END_CRIT_EDGE_US:%.*]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-PHI-NEXT: [[MUL_US:%.*]] = mul i32 [[I_021_US]], [[N]] -; CHECK-PHI-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[N]]) +; CHECK-PHI-NEXT: [[TMP0:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[N]]) ; CHECK-PHI-NEXT: br label [[WHILE_BODY3_US:%.*]] ; CHECK-PHI: while.body3.us: ; CHECK-PHI-NEXT: [[J_019_US:%.*]] = phi i32 [ 0, [[WHILE_COND1_PREHEADER_US]] ], [ [[INC_US:%.*]], [[WHILE_BODY3_US]] ] -; CHECK-PHI-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], [[WHILE_COND1_PREHEADER_US]] ], [ [[TMP1:%.*]], [[WHILE_BODY3_US]] ] +; CHECK-PHI-NEXT: [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[WHILE_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[WHILE_BODY3_US]] ] ; CHECK-PHI-NEXT: [[ADD_US:%.*]] = add i32 [[J_019_US]], [[MUL_US]] ; CHECK-PHI-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[ADD_US]] ; CHECK-PHI-NEXT: store i32 [[ADD_US]], i32* [[ARRAYIDX_US]], align 4 ; CHECK-PHI-NEXT: [[INC_US]] = add nuw i32 [[J_019_US]], 1 -; CHECK-PHI-NEXT: [[TMP1]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1) -; CHECK-PHI-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-PHI-NEXT: br i1 [[TMP2]], label [[WHILE_BODY3_US]], 
label [[WHILE_COND1_WHILE_END_CRIT_EDGE_US]] +; CHECK-PHI-NEXT: [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP1]], i32 1) +; CHECK-PHI-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0 +; CHECK-PHI-NEXT: br i1 [[TMP3]], label [[WHILE_BODY3_US]], label [[WHILE_COND1_WHILE_END_CRIT_EDGE_US]] ; CHECK-PHI: while.cond1.while.end_crit_edge.us: ; CHECK-PHI-NEXT: [[INC6_US]] = add nuw i32 [[I_021_US]], 1 ; CHECK-PHI-NEXT: [[EXITCOND23:%.*]] = icmp eq i32 [[INC6_US]], [[N]] @@ -897,18 +897,18 @@ ; CHECK-PHIGUARD: while.cond1.preheader.us: ; CHECK-PHIGUARD-NEXT: [[I_021_US:%.*]] = phi i32 [ [[INC6_US:%.*]], [[WHILE_COND1_WHILE_END_CRIT_EDGE_US:%.*]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-PHIGUARD-NEXT: [[MUL_US:%.*]] = mul i32 [[I_021_US]], [[N]] -; CHECK-PHIGUARD-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[N]]) +; CHECK-PHIGUARD-NEXT: [[TMP0:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[N]]) ; CHECK-PHIGUARD-NEXT: br label [[WHILE_BODY3_US:%.*]] ; CHECK-PHIGUARD: while.body3.us: ; CHECK-PHIGUARD-NEXT: [[J_019_US:%.*]] = phi i32 [ 0, [[WHILE_COND1_PREHEADER_US]] ], [ [[INC_US:%.*]], [[WHILE_BODY3_US]] ] -; CHECK-PHIGUARD-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], [[WHILE_COND1_PREHEADER_US]] ], [ [[TMP1:%.*]], [[WHILE_BODY3_US]] ] +; CHECK-PHIGUARD-NEXT: [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[WHILE_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[WHILE_BODY3_US]] ] ; CHECK-PHIGUARD-NEXT: [[ADD_US:%.*]] = add i32 [[J_019_US]], [[MUL_US]] ; CHECK-PHIGUARD-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[ADD_US]] ; CHECK-PHIGUARD-NEXT: store i32 [[ADD_US]], i32* [[ARRAYIDX_US]], align 4 ; CHECK-PHIGUARD-NEXT: [[INC_US]] = add nuw i32 [[J_019_US]], 1 -; CHECK-PHIGUARD-NEXT: [[TMP1]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1) -; CHECK-PHIGUARD-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-PHIGUARD-NEXT: br i1 [[TMP2]], label [[WHILE_BODY3_US]], label [[WHILE_COND1_WHILE_END_CRIT_EDGE_US]] +; CHECK-PHIGUARD-NEXT: [[TMP2]] = 
call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP1]], i32 1) +; CHECK-PHIGUARD-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0 +; CHECK-PHIGUARD-NEXT: br i1 [[TMP3]], label [[WHILE_BODY3_US]], label [[WHILE_COND1_WHILE_END_CRIT_EDGE_US]] ; CHECK-PHIGUARD: while.cond1.while.end_crit_edge.us: ; CHECK-PHIGUARD-NEXT: [[INC6_US]] = add nuw i32 [[I_021_US]], 1 ; CHECK-PHIGUARD-NEXT: [[EXITCOND23:%.*]] = icmp eq i32 [[INC6_US]], [[N]]