diff --git a/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h b/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h --- a/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h +++ b/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h @@ -119,13 +119,6 @@ /// use or a live out. bool isRegUsedAfter(MachineInstr *MI, int PhysReg); - /// Provides the first instruction before MI that uses PhysReg - MachineInstr *getInstWithUseBefore(MachineInstr *MI, int PhysReg); - - /// Provides all instructions before MI that uses PhysReg - void getAllInstWithUseBefore(MachineInstr *MI, int PhysReg, - SmallVectorImpl &Uses); - /// Provides the clearance - the number of instructions since the closest /// reaching def instuction of PhysReg that reaches MI. int getClearance(MachineInstr *MI, MCPhysReg PhysReg); @@ -133,11 +126,18 @@ /// Provides the uses, in the same block as MI, of register that MI defines. /// This does not consider live-outs. void getReachingLocalUses(MachineInstr *MI, int PhysReg, - SmallVectorImpl &Uses); - - /// Provide the number of uses, in the same block as MI, of the register that - /// MI defines. - unsigned getNumUses(MachineInstr *MI, int PhysReg); + SmallPtrSetImpl &Uses); + + /// For the given block, collect the instructions that use the live-in + /// value of the provided register. Return whether the value is still + /// live on exit. + bool getLiveInUses(MachineBasicBlock *MBB, int PhysReg, + SmallPtrSetImpl &Uses); + + /// Collect the users of the value stored in PhysReg, which is defined + /// by MI. + void getGlobalUses(MachineInstr *MI, int PhysReg, + SmallPtrSetImpl &Uses); private: /// Set up LiveRegs by merging predecessor live-out values. diff --git a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp --- a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp +++ b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp @@ -225,7 +225,7 @@ } void ReachingDefAnalysis::getReachingLocalUses(MachineInstr *Def, int PhysReg, - SmallVectorImpl &Uses) { + SmallPtrSetImpl &Uses) { MachineBasicBlock *MBB = Def->getParent(); MachineBasicBlock::iterator MI = MachineBasicBlock::iterator(Def); while (++MI != MBB->end()) { @@ -238,17 +238,54 @@ if (!MO.isReg() || !MO.isUse() || MO.getReg() != PhysReg) continue; - Uses.push_back(&*MI); + Uses.insert(&*MI); if (MO.isKill()) return; } } } -unsigned ReachingDefAnalysis::getNumUses(MachineInstr *Def, int PhysReg) { - SmallVector Uses; - getReachingLocalUses(Def, PhysReg, Uses); - return Uses.size(); +bool ReachingDefAnalysis::getLiveInUses(MachineBasicBlock *MBB, int PhysReg, + SmallPtrSetImpl &Uses) { + for (auto &MI : *MBB) { + for (auto &MO : MI.operands()) { + if (!MO.isReg() || !MO.isUse() || MO.getReg() != PhysReg) + continue; + if (getReachingDef(&MI, PhysReg) >= 0) + return false; + Uses.insert(&MI); + } + } + return isReachingDefLiveOut(&MBB->back(), PhysReg); +} + +void ReachingDefAnalysis::getGlobalUses(MachineInstr *MI, int PhysReg, + SmallPtrSetImpl &Uses) { + MachineBasicBlock *MBB = MI->getParent(); + + // Collect the uses that each def touches within the block. + getReachingLocalUses(MI, PhysReg, Uses); + + // Handle live-out values. + if (auto *LiveOut = getLocalLiveOutMIDef(MI->getParent(), PhysReg)) { + if (LiveOut != MI) + return; + + SmallVector ToVisit; + ToVisit.insert(ToVisit.begin(), MBB->successors().begin(), + MBB->successors().end()); + SmallPtrSetVisited; + while (!ToVisit.empty()) { + MachineBasicBlock *MBB = ToVisit.back(); + ToVisit.pop_back(); + if (Visited.count(MBB) || !MBB->isLiveIn(PhysReg)) + continue; + if (getLiveInUses(MBB, PhysReg, Uses)) + ToVisit.insert(ToVisit.end(), MBB->successors().begin(), + MBB->successors().end()); + Visited.insert(MBB); + } + } } bool ReachingDefAnalysis::isRegUsedAfter(MachineInstr *MI, int PhysReg) { @@ -305,28 +342,3 @@ return Def < 0 ? nullptr : getInstFromId(MBB, Def); } - -MachineInstr *ReachingDefAnalysis::getInstWithUseBefore(MachineInstr *MI, - int PhysReg) { - auto I = MachineBasicBlock::reverse_iterator(MI); - auto E = MI->getParent()->rend(); - I++; - - for ( ; I != E; I++) - for (auto &MO : I->operands()) - if (MO.isReg() && MO.isUse() && MO.getReg() == PhysReg) - return &*I; - - return nullptr; -} - -void ReachingDefAnalysis::getAllInstWithUseBefore(MachineInstr *MI, - int PhysReg, SmallVectorImpl &Uses) { - MachineInstr *Use = nullptr; - MachineInstr *Pos = MI; - - while ((Use = getInstWithUseBefore(Pos, PhysReg))) { - Uses.push_back(Use); - Pos = Use; - } -} diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -491,23 +491,24 @@ // This table shows the VPT instruction variants, i.e. the different // mask field encodings, see also B5.6. Predication/conditional execution in // the ArmARM. -enum VPTMaskValue { - T = 8, // 0b1000 - TT = 4, // 0b0100 - TE = 12, // 0b1100 - TTT = 2, // 0b0010 - TTE = 6, // 0b0110 - TEE = 10, // 0b1010 - TET = 14, // 0b1110 - TTTT = 1, // 0b0001 - TTTE = 3, // 0b0011 - TTEE = 5, // 0b0101 - TTET = 7, // 0b0111 - TEEE = 9, // 0b1001 - TEET = 11, // 0b1011 - TETT = 13, // 0b1101 - TETE = 15 // 0b1111 -}; + + +inline static unsigned getARMVPTBlockMask(unsigned NumInsts) { + switch (NumInsts) { + case 1: + return ARMVCC::T; + case 2: + return ARMVCC::TT; + case 3: + return ARMVCC::TTT; + case 4: + return ARMVCC::TTTT; + default: + break; + }; + llvm_unreachable("Unexpected number of instruction in a VPT block"); +} + static inline bool isVPTOpcode(int Opc) { return Opc == ARM::MVE_VPTv16i8 || Opc == ARM::MVE_VPTv16u8 || @@ -595,6 +596,18 @@ return 0; } +static inline unsigned getTailPredVectorWidth(unsigned Opcode) { + switch (Opcode) { + default: + llvm_unreachable("unhandled vctp opcode"); + case ARM::MVE_VCTP8: return 16; + case ARM::MVE_VCTP16: return 8; + case ARM::MVE_VCTP32: return 4; + case ARM::MVE_VCTP64: return 2; + } + return 0; +} + static inline bool isVCTP(MachineInstr *MI) { switch (MI->getOpcode()) { @@ -642,6 +655,17 @@ Opc == ARM::STMDB_UPD || Opc == ARM::VSTMDDB_UPD; } +static inline bool isSubImmOpcode(int Opc) { + return Opc == ARM::SUBri || + Opc == ARM::tSUBi3 || Opc == ARM::tSUBi8 || + Opc == ARM::tSUBSi3 || Opc == ARM::tSUBSi8 || + Opc == ARM::t2SUBri || Opc == ARM::t2SUBri12 || Opc == ARM::t2SUBSri; +} + +static inline bool isMovRegOpcode(int Opc) { + return Opc == ARM::MOVr || Opc == ARM::tMOVr || Opc == ARM::t2MOVr; +} + /// isValidCoprocessorNumber - decide whether an explicit coprocessor /// number is legal in generic instructions like CDP. The answer can /// vary with the subtarget. diff --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td --- a/llvm/lib/Target/ARM/ARMInstrThumb2.td +++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td @@ -5360,6 +5360,7 @@ t2PseudoInst<(outs), (ins rGPR:$elts), 4, IIC_Br, [(int_set_loop_iterations rGPR:$elts)]>, Sched<[WriteBr]>; +let hasSideEffects = 0 in def t2LoopDec : t2PseudoInst<(outs GPRlr:$Rm), (ins GPRlr:$Rn, imm0_7:$size), 4, IIC_Br, []>, Sched<[WriteBr]>; diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -45,6 +45,7 @@ #include "Thumb2InstrInfo.h" #include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineLoopUtils.h" @@ -121,9 +122,7 @@ public: PredicatedMI(MachineInstr *I, SetVector &Preds) : - MI(I) { - Predicates.insert(Preds.begin(), Preds.end()); - } + MI(I) { Predicates.insert(Preds.begin(), Preds.end()); } }; // Represent a VPT block, a list of instructions that begins with a VPST and @@ -186,6 +185,7 @@ VPTBlock *CurrentBlock = nullptr; SetVector CurrentPredicate; SmallVector VPTBlocks; + SmallPtrSet ToRemove; bool Revert = false; bool CannotTailPredicate = false; @@ -300,8 +300,6 @@ void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const; - void RemoveLoopUpdate(LowOverheadLoop &LoLoop); - void ConvertVPTBlocks(LowOverheadLoop &LoLoop); MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop); @@ -378,8 +376,45 @@ return true; } +static bool IsSafeToRemove(MachineInstr *MI, ReachingDefAnalysis *RDA, + SmallPtrSetImpl &Visited, + SmallPtrSetImpl &ToRemove, + SmallPtrSetImpl &Ignore) { + if (Visited.count(MI) || Ignore.count(MI)) + return true; + else if (MI->mayLoadOrStore() || MI->hasUnmodeledSideEffects() || + MI->isBranch() || MI->isTerminator() || MI->isReturn()) { + // Unless told to ignore the instruction, don't remove anything which has + // side effects. + LLVM_DEBUG(dbgs() << "ARM Loops: Has side effects: " << *MI); + return false; + } + + Visited.insert(MI); + for (auto &MO : MI->operands()) { + if (!MO.isReg() || MO.isUse() || MO.getReg() == 0) + continue; + + SmallPtrSet Uses; + RDA->getGlobalUses(MI, MO.getReg(), Uses); + + for (auto I : Uses) { + if (Ignore.count(I) || ToRemove.count(I)) + continue; + if (!IsSafeToRemove(I, RDA, Visited, ToRemove, Ignore)) { + LLVM_DEBUG(dbgs() << "ARM Loops: Unable to remove " << *I); + return false; + } + } + } + ToRemove.insert(MI); + LLVM_DEBUG(dbgs() << "ARM Loops: Can remove: " << *MI); + return true; +} + bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt, - ReachingDefAnalysis *RDA, MachineLoopInfo *MLI) { + ReachingDefAnalysis *RDA, + MachineLoopInfo *MLI) { assert(VCTP && "VCTP instruction expected but is not set"); // All predication within the loop should be based on vctp. If the block // isn't predicated on entry, check whether the vctp is within the block @@ -397,10 +432,9 @@ if (PredMI.Predicates.count(VCTP) || isVCTP(PredMI.MI)) continue; LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *PredMI.MI - << " - which is predicated on:\n"; - for (auto *MI : PredMI.Predicates) - dbgs() << " - " << *MI; - ); + << " - which is predicated on:\n"; + for (auto *MI : PredMI.Predicates) + dbgs() << " - " << *MI); return false; } } @@ -422,17 +456,20 @@ // The element count register maybe defined after InsertPt, in which case we // need to try to move either InsertPt or the def so that the [w|d]lstp can // use the value. - MachineBasicBlock *InsertBB = InsertPt->getParent(); - if (!RDA->isReachingDefLiveOut(InsertPt, NumElements)) { + MachineBasicBlock *InsertBB = StartInsertPt->getParent(); + if (!RDA->isReachingDefLiveOut(StartInsertPt, NumElements)) { if (auto *ElemDef = RDA->getLocalLiveOutMIDef(InsertBB, NumElements)) { - if (IsSafeToMove(ElemDef, InsertPt, RDA)) { + if (IsSafeToMove( + ElemDef, StartInsertPt, RDA)) { ElemDef->removeFromParent(); - InsertBB->insert(MachineBasicBlock::iterator(InsertPt), ElemDef); + InsertBB->insert(MachineBasicBlock::iterator(StartInsertPt), ElemDef); LLVM_DEBUG(dbgs() << "ARM Loops: Moved element count def: " << *ElemDef); - } else if (IsSafeToMove(InsertPt, ElemDef, RDA)) { - InsertPt->removeFromParent(); - InsertBB->insertAfter(MachineBasicBlock::iterator(ElemDef), InsertPt); + } else if (IsSafeToMove( + StartInsertPt, ElemDef, RDA)) { + StartInsertPt->removeFromParent(); + InsertBB->insertAfter(MachineBasicBlock::iterator(ElemDef), + StartInsertPt); LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef); } else { LLVM_DEBUG(dbgs() << "ARM Loops: Unable to move element count to loop " @@ -474,7 +511,55 @@ MBB = *MBB->pred_begin(); } - LLVM_DEBUG(dbgs() << "ARM Loops: Will use tail predication.\n"); + // Check that the value change of the element count is what we expect and + // that the predication will be equivalent. For this we need: + // NumElements = NumElements - VectorWidth. The sub will be a sub immediate + // and we can also allow register copies within the chain too. + auto IsValidSub = [](MachineInstr *MI, unsigned ExpectedVecWidth) { + unsigned ImmOpIdx = 0; + switch (MI->getOpcode()) { + default: + llvm_unreachable("unhandled sub opcode"); + case ARM::tSUBi3: + case ARM::tSUBi8: + ImmOpIdx = 3; + break; + case ARM::t2SUBri: + case ARM::t2SUBri12: + ImmOpIdx = 2; + break; + } + return MI->getOperand(ImmOpIdx).getImm() == ExpectedVecWidth; + }; + + MBB = VCTP->getParent(); + if (MachineInstr *Def = RDA->getReachingMIDef(&MBB->back(), NumElements)) { + SmallPtrSet Visited; + SmallPtrSet ElementChain; + SmallPtrSet Ignore = { VCTP }; + unsigned ExpectedVectorWidth = getTailPredVectorWidth(VCTP->getOpcode()); + + if (IsSafeToRemove(Def, RDA, Visited, ElementChain, Ignore)) { + bool FoundSub = false; + + for (auto *MI : ElementChain) { + if (isMovRegOpcode(MI->getOpcode())) + continue; + + if (isSubImmOpcode(MI->getOpcode())) { + if (FoundSub || !IsValidSub(MI, ExpectedVectorWidth)) + return false; + FoundSub = true; + } else + return false; + } + + LLVM_DEBUG(dbgs() << "ARM Loops: Will remove element count chain:\n"; + for (auto *MI : ElementChain) + dbgs() << " - " << *MI); + ToRemove.insert(ElementChain.begin(), ElementChain.end()); + } + } return true; } @@ -646,6 +731,8 @@ dbgs() << " - " << Preheader->getName() << "\n"; else if (auto *Preheader = MLI->findLoopPreheader(ML)) dbgs() << " - " << Preheader->getName() << "\n"; + else if (auto *Preheader = MLI->findLoopPreheader(ML, true)) + dbgs() << " - " << Preheader->getName() << "\n"; for (auto *MBB : ML->getBlocks()) dbgs() << " - " << MBB->getName() << "\n"; ); @@ -696,28 +783,6 @@ // Check we know how to tail predicate any mve instructions. LoLoop.AnalyseMVEInst(&MI); } - - // We need to ensure that LR is not used or defined inbetween LoopDec and - // LoopEnd. - if (!LoLoop.Dec || LoLoop.End || LoLoop.Revert) - continue; - - // If we find that LR has been written or read between LoopDec and - // LoopEnd, expect that the decremented value is being used else where. - // Because this value isn't actually going to be produced until the - // latch, by LE, we would need to generate a real sub. The value is also - // likely to be copied/reloaded for use of LoopEnd - in which in case - // we'd need to perform an add because it gets subtracted again by LE! - // The other option is to then generate the other form of LE which doesn't - // perform the sub. - for (auto &MO : MI.operands()) { - if (MI.getOpcode() != ARM::t2LoopDec && MO.isReg() && - MO.getReg() == ARM::LR) { - LLVM_DEBUG(dbgs() << "ARM Loops: Found LR Use/Def: " << MI); - LoLoop.Revert = true; - break; - } - } } } @@ -727,6 +792,19 @@ return false; } + SmallPtrSet Visited; + SmallPtrSet Ignore = { LoLoop.End }; + SmallPtrSet Remove; + if (!IsSafeToRemove(LoLoop.Dec, RDA, Visited, Remove, Ignore)) { + LLVM_DEBUG(dbgs() << "ARM Loops: Unable to remove loop count chain.\n"); + LoLoop.Revert = true; + } else { + LLVM_DEBUG(dbgs() << "ARM Loops: Will need to remove:\n"; + for (auto *I : Remove) + dbgs() << " - " << *I); + LoLoop.ToRemove.insert(Remove.begin(), Remove.end()); + } + LoLoop.CheckLegality(BBUtils.get(), RDA, MLI); Expand(LoLoop); return true; @@ -815,6 +893,37 @@ } MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) { + LLVM_DEBUG(dbgs() << "ARM Loops: Expanding LoopStart.\n"); + // When using tail-predication, try to delete the dead code that was used to + // calculate the number of loop iterations. + if (LoLoop.IsTailPredicationLegal()) { + SmallVector Killed; + SmallVector Dead; + if (auto *Def = RDA->getReachingMIDef(LoLoop.Start, + LoLoop.Start->getOperand(0).getReg())) { + SmallPtrSet Visited; + SmallPtrSet Remove; + SmallPtrSet Ignore = { LoLoop.Start, LoLoop.Dec, + LoLoop.End, LoLoop.VCTP, + LoLoop.InsertPt }; + SmallVector Chain = { Def }; + while (!Chain.empty()) { + MachineInstr *MI = Chain.back(); + Chain.pop_back(); + if (IsSafeToRemove(MI, RDA, Visited, Remove, Ignore)) { + for (auto &MO : MI->operands()) { + if (!MO.isReg() || !MO.isUse() || MO.getReg() == 0) + continue; + if (auto *Op = RDA->getReachingMIDef(MI, MO.getReg())) + Chain.push_back(Op); + } + Ignore.insert(MI); + } + } + LoLoop.ToRemove.insert(Remove.begin(), Remove.end()); + } + } + MachineInstr *InsertPt = LoLoop.InsertPt; MachineInstr *Start = LoLoop.Start; MachineBasicBlock *MBB = InsertPt->getParent(); @@ -830,111 +939,14 @@ if (!IsDo) MIB.add(Start->getOperand(1)); - // When using tail-predication, try to delete the dead code that was used to - // calculate the number of loop iterations. - if (LoLoop.IsTailPredicationLegal()) { - SmallVector Killed; - SmallVector Dead; - if (auto *Def = RDA->getReachingMIDef(Start, - Start->getOperand(0).getReg())) { - Killed.push_back(Def); - - while (!Killed.empty()) { - MachineInstr *Def = Killed.back(); - Killed.pop_back(); - Dead.push_back(Def); - for (auto &MO : Def->operands()) { - if (!MO.isReg() || !MO.isKill()) - continue; - - MachineInstr *Kill = RDA->getReachingMIDef(Def, MO.getReg()); - if (Kill && RDA->getNumUses(Kill, MO.getReg()) == 1) - Killed.push_back(Kill); - } - } - for (auto *MI : Dead) - MI->eraseFromParent(); - } - } - // If we're inserting at a mov lr, then remove it as it's redundant. if (InsertPt != Start) - InsertPt->eraseFromParent(); - Start->eraseFromParent(); + LoLoop.ToRemove.insert(InsertPt); + LoLoop.ToRemove.insert(Start); LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB); return &*MIB; } -// Goal is to optimise and clean-up these loops: -// -// vector.body: -// renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg -// renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3(tied-def 0), 4 -// .. -// $lr = MVE_DLSTP_32 renamable $r3 -// -// The SUB is the old update of the loop iteration count expression, which -// is no longer needed. This sub is removed when the element count, which is in -// r3 in this example, is defined by an instruction in the loop, and it has -// no uses. -// -void ARMLowOverheadLoops::RemoveLoopUpdate(LowOverheadLoop &LoLoop) { - Register ElemCount = LoLoop.VCTP->getOperand(1).getReg(); - MachineInstr *LastInstrInBlock = &LoLoop.VCTP->getParent()->back(); - - LLVM_DEBUG(dbgs() << "ARM Loops: Trying to remove loop update stmt\n"); - - if (LoLoop.ML->getNumBlocks() != 1) { - LLVM_DEBUG(dbgs() << "ARM Loops: Single block loop expected\n"); - return; - } - - LLVM_DEBUG(dbgs() << "ARM Loops: Analyzing elemcount in operand: "; - LoLoop.VCTP->getOperand(1).dump()); - - // Find the definition we are interested in removing, if there is one. - MachineInstr *Def = RDA->getReachingMIDef(LastInstrInBlock, ElemCount); - if (!Def) { - LLVM_DEBUG(dbgs() << "ARM Loops: Can't find a def, nothing to do.\n"); - return; - } - - // Bail if we define CPSR and it is not dead - if (!Def->registerDefIsDead(ARM::CPSR, TRI)) { - LLVM_DEBUG(dbgs() << "ARM Loops: CPSR is not dead\n"); - return; - } - - // Bail if elemcount is used in exit blocks, i.e. if it is live-in. - if (isRegLiveInExitBlocks(LoLoop.ML, ElemCount)) { - LLVM_DEBUG(dbgs() << "ARM Loops: Elemcount is live-out, can't remove stmt\n"); - return; - } - - // Bail if there are uses after this Def in the block. - SmallVector Uses; - RDA->getReachingLocalUses(Def, ElemCount, Uses); - if (Uses.size()) { - LLVM_DEBUG(dbgs() << "ARM Loops: Local uses in block, can't remove stmt\n"); - return; - } - - Uses.clear(); - RDA->getAllInstWithUseBefore(Def, ElemCount, Uses); - - // Remove Def if there are no uses, or if the only use is the VCTP - // instruction. - if (!Uses.size() || (Uses.size() == 1 && Uses[0] == LoLoop.VCTP)) { - LLVM_DEBUG(dbgs() << "ARM Loops: Removing loop update instruction: "; - Def->dump()); - Def->eraseFromParent(); - return; - } - - LLVM_DEBUG(dbgs() << "ARM Loops: Can't remove loop update, it's used by:\n"; - for (auto U : Uses) U->dump()); -} - void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) { auto RemovePredicate = [](MachineInstr *MI) { LLVM_DEBUG(dbgs() << "ARM Loops: Removing predicate from: " << *MI); @@ -996,7 +1008,7 @@ MIB.addImm(getARMVPTBlockMask(Size)); LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getVPST()); LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB); - Block.getVPST()->eraseFromParent(); + LoLoop.ToRemove.insert(Block.getVPST()); } } else if (Block.IsOnlyPredicatedOn(LoLoop.VCTP)) { // A vpt block which is only predicated upon vctp and has no internal vpr @@ -1004,14 +1016,13 @@ // - Remove vpst. // - Unpredicate the remaining instructions. LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getVPST()); - Block.getVPST()->eraseFromParent(); + LoLoop.ToRemove.insert(Block.getVPST()); for (auto &PredMI : Insts) RemovePredicate(PredMI.MI); } } - LLVM_DEBUG(dbgs() << "ARM Loops: Removing VCTP: " << *LoLoop.VCTP); - LoLoop.VCTP->eraseFromParent(); + LoLoop.ToRemove.insert(LoLoop.VCTP); } void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) { @@ -1028,9 +1039,7 @@ MIB.add(End->getOperand(0)); MIB.add(End->getOperand(1)); LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB); - - LoLoop.End->eraseFromParent(); - LoLoop.Dec->eraseFromParent(); + End->eraseFromParent(); return &*MIB; }; @@ -1063,9 +1072,11 @@ RemoveDeadBranch(LoLoop.Start); LoLoop.End = ExpandLoopEnd(LoLoop); RemoveDeadBranch(LoLoop.End); - if (LoLoop.IsTailPredicationLegal()) { - RemoveLoopUpdate(LoLoop); + if (LoLoop.IsTailPredicationLegal()) ConvertVPTBlocks(LoLoop); + for (auto *I : LoLoop.ToRemove) { + LLVM_DEBUG(dbgs() << "ARM Loops: Erasing " << *I); + I->eraseFromParent(); } } diff --git a/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h b/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h --- a/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h +++ b/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h @@ -111,22 +111,6 @@ }; } -inline static unsigned getARMVPTBlockMask(unsigned NumInsts) { - switch (NumInsts) { - case 1: - return ARMVCC::T; - case 2: - return ARMVCC::TT; - case 3: - return ARMVCC::TTT; - case 4: - return ARMVCC::TTTT; - default: - break; - }; - llvm_unreachable("Unexpected number of instruction in a VPT block"); -} - inline static const char *ARMVPTPredToString(ARMVCC::VPTCodes CC) { switch (CC) { case ARMVCC::None: return "none"; diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir @@ -1,12 +1,7 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve,+lob -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s -# There are 2 SUBS, and the 2nd one is identified as the def. -# Thus, the 1st is a use, and we shouldn't optimise away the SUBS. - -# CHECK: bb.1.vector.body: -# CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg -# CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg -# CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 +# There are 2 SUBS, so don't use tail predication --- | target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" @@ -38,47 +33,26 @@ %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>* %8 = call <4 x i1> @llvm.arm.vctp32(i32 %7) %9 = sub i32 %7, 4 - %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %8, <4 x i32> undef), !tbaa !3 - %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %8, <4 x i32> undef), !tbaa !3 + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %8, <4 x i32> undef) + %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %8, <4 x i32> undef) %10 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load - call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %10, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %8), !tbaa !3 + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %10, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %8) %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4 %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4 %11 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1) %12 = icmp ne i32 %11, 0 - br i1 %12, label %vector.body, label %for.cond.cleanup, !llvm.loop !7 + br i1 %12, label %vector.body, label %for.cond.cleanup for.cond.cleanup: ; preds = %vector.body, %entry ret void } - declare void @llvm.set.loop.iterations.i32(i32) #1 - declare <4 x i1> @llvm.arm.vctp32(i32) #2 - declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1 - declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #3 - declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #4 - declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #3 - declare void @llvm.stackprotector(i8*, i8**) #5 - - attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+fp-armv8d16sp,+fp16,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" "unsafe-fp-math"="true" "use-soft-float"="false" } - attributes #1 = { noduplicate nounwind } - attributes #2 = { nounwind readnone } - attributes #3 = { argmemonly nounwind willreturn } - attributes #4 = { argmemonly nounwind readonly willreturn } - attributes #5 = { nounwind } - - !llvm.module.flags = !{!0, !1} - !llvm.ident = !{!2} - - !0 = !{i32 1, !"wchar_size", i32 4} - !1 = !{i32 1, !"min_enum_size", i32 4} - !2 = !{!"clang version 10.0.0 (http://github.com/llvm/llvm-project 2589b6d9edda73280fe1dc1d944ee34e22fe9a6f)"} - !3 = !{!4, !4, i64 0} - !4 = !{!"int", !5, i64 0} - !5 = !{!"omnipotent char", !6, i64 0} - !6 = !{!"Simple C++ TBAA"} - !7 = distinct !{!7, !8} - !8 = !{!"llvm.loop.isvectorized", i32 1} + declare void @llvm.set.loop.iterations.i32(i32) + declare <4 x i1> @llvm.arm.vctp32(i32) + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) + declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) + declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) + declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) ... --- @@ -128,11 +102,45 @@ constants: [] machineFunctionInfo: {} body: | + ; CHECK-LABEL: name: use_before_def + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 + ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: dead $r7 = frame-setup tMOVr $sp, 14, $noreg + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7 + ; CHECK: tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr + ; CHECK: t2IT 11, 8, implicit-def $itstate + ; CHECK: tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate + ; CHECK: renamable $r12 = t2ADDri renamable $r3, 3, 14, $noreg, $noreg + ; CHECK: renamable $lr = t2MOVi 1, 14, $noreg, $noreg + ; CHECK: renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg + ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg + ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg + ; CHECK: $lr = t2DLS killed renamable $lr + ; CHECK: bb.1.vector.body: + ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 + ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg + ; CHECK: MVE_VPST 4, implicit $vpr + ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4) + ; CHECK: renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4) + ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg + ; CHECK: renamable $q0 = nsw MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 + ; CHECK: MVE_VPST 8, implicit $vpr + ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4) + ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1 + ; CHECK: bb.2.for.cond.cleanup: + ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc bb.0.entry: successors: %bb.1(0x80000000) - liveins: $r0, $r1, $r2, $r3, $lr + liveins: $r0, $r1, $r2, $r3, $r7, $lr - frame-setup tPUSH 14, $noreg, $r7, killed $lr, implicit-def $sp, implicit $sp + frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp frame-setup CFI_INSTRUCTION def_cfa_offset 8 frame-setup CFI_INSTRUCTION offset $lr, -4 frame-setup CFI_INSTRUCTION offset $r7, -8 @@ -154,12 +162,12 @@ renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg MVE_VPST 4, implicit $vpr - renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4, !tbaa !3) - renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4, !tbaa !3) + renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4) + renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4) renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg renamable $q0 = nsw MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 MVE_VPST 8, implicit $vpr - renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4, !tbaa !3) + renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4) renamable $lr = t2LoopDec killed renamable $lr, 1 renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update2.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update2.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update2.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update2.mir @@ -1,19 +1,7 @@ -# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve,+lob -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s - -# The CPSR is not dead: -# -# renamable $r3, $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg -# -# We shouldn't optimise away the SUB. - -# CHECK: bb.1.vector.body: -# CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg -# CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - | FileCheck %s --- | - target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" - target triple = "thumbv8.1m.main-arm-unknown-eabi" - define dso_local void @CPSR_not_dead(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { entry: %cmp8 = icmp sgt i32 %N, 0 @@ -30,57 +18,36 @@ br label %vector.body vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %5, %vector.ph ] %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ] %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ] %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ] - %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ] - %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ] + %6 = phi i32 [ %N, %vector.ph ], [ %8, %vector.body ] %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>* %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>* - %8 = call <4 x i1> @llvm.arm.vctp32(i32 %7) - %9 = sub i32 %7, 4 - %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %8, <4 x i32> undef), !tbaa !3 - %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %8, <4 x i32> undef), !tbaa !3 - %10 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load - call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %10, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %8), !tbaa !3 + %7 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %6) + %8 = sub i32 %6, 4 + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %7, <4 x i32> undef) + %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %7, <4 x i32> undef) + %9 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %9, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %7) %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4 %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4 - %11 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1) - %12 = icmp ne i32 %11, 0 - br i1 %12, label %vector.body, label %for.cond.cleanup, !llvm.loop !7 + %10 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv1, i32 1) + %11 = icmp ne i32 %10, 0 + %lsr.iv.next = add nsw i32 %lsr.iv1, -1 + br i1 %11, label %vector.body, label %for.cond.cleanup for.cond.cleanup: ; preds = %vector.body, %entry ret void } - declare void @llvm.set.loop.iterations.i32(i32) #1 - declare <4 x i1> @llvm.arm.vctp32(i32) #2 - declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1 - declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #3 - declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #4 - declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #3 - declare void @llvm.stackprotector(i8*, i8**) #5 - - attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+fp-armv8d16sp,+fp16,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" "unsafe-fp-math"="true" "use-soft-float"="false" } - attributes #1 = { noduplicate nounwind } - attributes #2 = { nounwind readnone } - attributes #3 = { argmemonly nounwind willreturn } - attributes #4 = { argmemonly nounwind readonly willreturn } - attributes #5 = { nounwind } - - !llvm.module.flags = !{!0, !1} - !llvm.ident = !{!2} - - !0 = !{i32 1, !"wchar_size", i32 4} - !1 = !{i32 1, !"min_enum_size", i32 4} - !2 = !{!"clang version 10.0.0 (http://github.com/llvm/llvm-project 2589b6d9edda73280fe1dc1d944ee34e22fe9a6f)"} - !3 = !{!4, !4, i64 0} - !4 = !{!"int", !5, i64 0} - !5 = !{!"omnipotent char", !6, i64 0} - !6 = !{!"Simple C++ TBAA"} - !7 = distinct !{!7, !8} - !8 = !{!"llvm.loop.isvectorized", i32 1} + declare void @llvm.set.loop.iterations.i32(i32) + declare <4 x i1> @llvm.arm.mve.vctp32(i32) + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) + declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) + declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) ... --- @@ -124,49 +91,79 @@ stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } callSites: [] constants: [] machineFunctionInfo: {} body: | + ; CHECK-LABEL: name: CPSR_not_dead + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r4 + ; CHECK: frame-setup tPUSH 14, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -8 + ; CHECK: tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr + ; CHECK: t2IT 11, 8, implicit-def $itstate + ; CHECK: tPOP_RET 11, killed $cpsr, def $r4, def $pc, implicit killed $itstate + ; CHECK: $lr = MVE_DLSTP_32 renamable $r3 + ; CHECK: bb.1.vector.body: + ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 + ; CHECK: renamable $r3, $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg + ; CHECK: t2IT 11, 8, implicit-def $itstate + ; CHECK: tPOP_RET 11, killed $cpsr, def $r4, def $pc, implicit killed $itstate + ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 0, $noreg :: (load 16 from %ir.lsr.iv13, align 4) + ; CHECK: renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 0, $noreg :: (load 16 from %ir.lsr.iv1416, align 4) + ; CHECK: renamable $q0 = nsw MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 + ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 0, killed $noreg :: (store 16 into %ir.lsr.iv1719, align 4) + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 + ; CHECK: bb.2.for.cond.cleanup: + ; CHECK: t2IT 11, 8, implicit-def dead $itstate + ; CHECK: tPOP_RET 14, $noreg, def $r4, def $pc bb.0.entry: successors: %bb.1(0x80000000) - liveins: $r0, $r1, $r2, $r3, $lr + liveins: $r0, $r1, $r2, $r3, $r4, $lr - frame-setup tPUSH 14, $noreg, $r7, killed $lr, implicit-def $sp, implicit $sp + frame-setup tPUSH 14, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp frame-setup CFI_INSTRUCTION def_cfa_offset 8 frame-setup CFI_INSTRUCTION offset $lr, -4 - frame-setup CFI_INSTRUCTION offset $r7, -8 - $r7 = frame-setup tMOVr $sp, 14, $noreg - frame-setup CFI_INSTRUCTION def_cfa_register $r7 + frame-setup CFI_INSTRUCTION offset $r4, -8 tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr t2IT 11, 8, implicit-def $itstate - tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate + tPOP_RET 11, killed $cpsr, def $r4, def $pc, implicit killed $itstate renamable $r12 = t2ADDri renamable $r3, 3, 14, $noreg, $noreg renamable $lr = t2MOVi 1, 14, $noreg, $noreg renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg - renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg - t2DoLoopStart renamable $lr + renamable $r4 = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg + t2DoLoopStart renamable $r4 + $r12 = tMOVr killed $r4, 14, $noreg bb.1.vector.body: successors: %bb.1(0x7c000000), %bb.2(0x04000000) - liveins: $lr, $r0, $r1, $r2, $r3 + liveins: $r0, $r1, $r2, $r3, $r12 renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg - MVE_VPST 4, implicit $vpr - renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4, !tbaa !3) - renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4, !tbaa !3) + $lr = tMOVr $r12, 14, $noreg + renamable $r12 = nsw t2SUBri killed $r12, 1, 14, $noreg, $noreg renamable $r3, $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg + t2IT 11, 8, implicit-def $itstate + tPOP_RET 11, killed $cpsr, def $r4, def $pc, implicit killed $itstate + MVE_VPST 4, implicit $vpr + renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4) + renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4) + renamable $lr = t2LoopDec killed renamable $lr, 1 renamable $q0 = nsw MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 MVE_VPST 8, implicit $vpr - renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4, !tbaa !3) - renamable $lr = t2LoopDec killed renamable $lr, 1 - t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr + renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4) + t2LoopEnd killed renamable $lr, %bb.1, implicit-def dead $cpsr tB %bb.2, 14, $noreg bb.2.for.cond.cleanup: - tPOP_RET 14, $noreg, def $r7, def $pc + t2IT 11, 8, implicit-def $itstate + tPOP_RET 14, $noreg, def $r4, def $pc ... diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll @@ -232,10 +232,10 @@ ; CHECK-NEXT: vctp.32 r2 ; CHECK-NEXT: mov r3, r2 ; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vldrwt.u32 q2, [r0], #16 ; CHECK-NEXT: vldrwt.u32 q3, [r1], #16 -; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vfma.f32 q0, q3, q2 ; CHECK-NEXT: le lr, .LBB1_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block @@ -340,18 +340,17 @@ ; CHECK-NEXT: .LBB2_3: @ %else25 ; CHECK-NEXT: @ in Loop: Header=BB2_4 Depth=1 ; CHECK-NEXT: vmul.f16 q5, q6, q5 -; CHECK-NEXT: sub.w lr, lr, #1 +; CHECK-NEXT: adds r0, #8 ; CHECK-NEXT: vmovx.f16 s2, s21 ; CHECK-NEXT: vmovx.f16 s0, s20 ; CHECK-NEXT: vcvtb.f32.f16 s27, s2 -; CHECK-NEXT: adds r0, #8 -; CHECK-NEXT: vcvtb.f32.f16 s26, s21 ; CHECK-NEXT: adds r1, #8 -; CHECK-NEXT: vcvtb.f32.f16 s25, s0 +; CHECK-NEXT: vcvtb.f32.f16 s26, s21 ; CHECK-NEXT: adds r3, #4 +; CHECK-NEXT: vcvtb.f32.f16 s25, s0 +; CHECK-NEXT: subs.w lr, lr, #1 ; CHECK-NEXT: vcvtb.f32.f16 s24, s20 ; CHECK-NEXT: vadd.f32 q5, q3, q6 -; CHECK-NEXT: cmp.w lr, #0 ; CHECK-NEXT: bne .LBB2_4 ; CHECK-NEXT: b .LBB2_21 ; CHECK-NEXT: .LBB2_4: @ %vector.body diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-16.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-16.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-16.mir @@ -0,0 +1,165 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s +--- | + define dso_local void @incorrect_sub_16(i16* noalias nocapture %A, i16* noalias nocapture readonly %B, i16* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { + entry: + %cmp8 = icmp sgt i32 %N, 0 + %0 = add i32 %N, 3 + %1 = lshr i32 %0, 2 + %2 = shl nuw i32 %1, 2 + %3 = add i32 %2, -4 + %4 = lshr i32 %3, 2 + %5 = add nuw nsw i32 %4, 1 + br i1 %cmp8, label %vector.ph, label %for.cond.cleanup + + vector.ph: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %5) + br label %vector.body + + vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv17 = phi i16* [ %scevgep18, %vector.body ], [ %A, %vector.ph ] + %lsr.iv14 = phi i16* [ %scevgep15, %vector.body ], [ %C, %vector.ph ] + %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %B, %vector.ph ] + %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ] + %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ] + %lsr.iv13 = bitcast i16* %lsr.iv to <8 x i16>* + %lsr.iv1416 = bitcast i16* %lsr.iv14 to <8 x i16>* + %lsr.iv1719 = bitcast i16* %lsr.iv17 to <8 x i16>* + %8 = call <8 x i1> @llvm.arm.mve.vctp16(i32 %7) + %9 = sub i32 %7, 7 + %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %lsr.iv13, i32 4, <8 x i1> %8, <8 x i16> undef) + %wide.masked.load12 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %lsr.iv1416, i32 4, <8 x i1> %8, <8 x i16> undef) + %10 = add nsw <8 x i16> %wide.masked.load12, %wide.masked.load + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %10, <8 x i16>* %lsr.iv1719, i32 4, <8 x i1> %8) + %scevgep = getelementptr i16, i16* %lsr.iv, i32 8 + %scevgep15 = getelementptr i16, i16* %lsr.iv14, i32 8 + %scevgep18 = getelementptr i16, i16* %lsr.iv17, i32 8 + %11 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1) + %12 = icmp ne i32 %11, 0 + br i1 %12, label %vector.body, label %for.cond.cleanup + + for.cond.cleanup: ; preds = %vector.body, %entry + ret void + } + declare void @llvm.set.loop.iterations.i32(i32) + declare <8 x i1> @llvm.arm.mve.vctp16(i32) + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) + declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>) + declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>) +... +--- +name: incorrect_sub_16 +alignment: 2 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } + - { reg: '$r3', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: incorrect_sub_16 + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r7 + ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr + ; CHECK: t2IT 11, 8, implicit-def $itstate + ; CHECK: tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate + ; CHECK: renamable $r12 = t2ADDri renamable $r3, 3, 14, $noreg, $noreg + ; CHECK: renamable $lr = t2MOVi 1, 14, $noreg, $noreg + ; CHECK: renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg + ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg + ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg + ; CHECK: $lr = t2DLS killed renamable $lr + ; CHECK: bb.1.vector.body: + ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 + ; CHECK: renamable $vpr = MVE_VCTP16 renamable $r3, 0, $noreg + ; CHECK: MVE_VPST 4, implicit $vpr + ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRHU16_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4) + ; CHECK: renamable $r2, renamable $q1 = MVE_VLDRHU16_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4) + ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 7, 14, $noreg + ; CHECK: renamable $q0 = nsw MVE_VADDi16 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 + ; CHECK: MVE_VPST 8, implicit $vpr + ; CHECK: renamable $r0 = MVE_VSTRHU16_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4) + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1 + ; CHECK: bb.2.for.cond.cleanup: + ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc + bb.0.entry: + successors: %bb.1(0x80000000) + liveins: $r0, $r1, $r2, $r3, $r7, $lr + + frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr + t2IT 11, 8, implicit-def $itstate + tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate + renamable $r12 = t2ADDri renamable $r3, 3, 14, $noreg, $noreg + renamable $lr = t2MOVi 1, 14, $noreg, $noreg + renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg + renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg + renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg + t2DoLoopStart renamable $lr + + bb.1.vector.body: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $lr, $r0, $r1, $r2, $r3 + + renamable $vpr = MVE_VCTP16 renamable $r3, 0, $noreg + MVE_VPST 4, implicit $vpr + renamable $r1, renamable $q0 = MVE_VLDRHU16_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4) + renamable $r2, renamable $q1 = MVE_VLDRHU16_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4) + renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 7, 14, $noreg + renamable $q0 = nsw MVE_VADDi16 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 + MVE_VPST 8, implicit $vpr + renamable $r0 = MVE_VSTRHU16_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4) + renamable $lr = t2LoopDec killed renamable $lr, 1 + t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.2, 14, $noreg + + bb.2.for.cond.cleanup: + tPOP_RET 14, $noreg, def $r7, def $pc + +... diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update3.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-32.mir rename from llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update3.mir rename to llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-32.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update3.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-32.mir @@ -1,3 +1,4 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve,+lob -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s # Local use after def, this mov is using r3: @@ -6,15 +7,8 @@ # # We should optimise away the SUB -# CHECK: bb.1.vector.body: -# CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg -# CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 - --- | - target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" - target triple = "thumbv8.1m.main-arm-unknown-eabi" - - define dso_local void @local_use_after_def(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { + define dso_local void @incorrect_sub_32(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { entry: %cmp8 = icmp sgt i32 %N, 0 %0 = add i32 %N, 3 @@ -38,53 +32,31 @@ %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>* %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>* - %8 = call <4 x i1> @llvm.arm.vctp32(i32 %7) - %9 = sub i32 %7, 4 - %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %8, <4 x i32> undef), !tbaa !3 - %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %8, <4 x i32> undef), !tbaa !3 + %8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7) + %9 = sub i32 %7, 5 + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %8, <4 x i32> undef) + %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %8, <4 x i32> undef) %10 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load - call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %10, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %8), !tbaa !3 + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %10, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %8) %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4 %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4 %11 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1) %12 = icmp ne i32 %11, 0 - br i1 %12, label %vector.body, label %for.cond.cleanup, !llvm.loop !7 + br i1 %12, label %vector.body, label %for.cond.cleanup for.cond.cleanup: ; preds = %vector.body, %entry ret void } - declare void @llvm.set.loop.iterations.i32(i32) #1 - declare <4 x i1> @llvm.arm.vctp32(i32) #2 - declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1 - declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #3 - declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #4 - declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #3 - declare void @llvm.stackprotector(i8*, i8**) #5 - - attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+fp-armv8d16sp,+fp16,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" "unsafe-fp-math"="true" "use-soft-float"="false" } - attributes #1 = { noduplicate nounwind } - attributes #2 = { nounwind readnone } - attributes #3 = { argmemonly nounwind willreturn } - attributes #4 = { argmemonly nounwind readonly willreturn } - attributes #5 = { nounwind } - - !llvm.module.flags = !{!0, !1} - !llvm.ident = !{!2} - - !0 = !{i32 1, !"wchar_size", i32 4} - !1 = !{i32 1, !"min_enum_size", i32 4} - !2 = !{!"clang version 10.0.0 (http://github.com/llvm/llvm-project 2589b6d9edda73280fe1dc1d944ee34e22fe9a6f)"} - !3 = !{!4, !4, i64 0} - !4 = !{!"int", !5, i64 0} - !5 = !{!"omnipotent char", !6, i64 0} - !6 = !{!"Simple C++ TBAA"} - !7 = distinct !{!7, !8} - !8 = !{!"llvm.loop.isvectorized", i32 1} + declare void @llvm.set.loop.iterations.i32(i32) + declare <4 x i1> @llvm.arm.mve.vctp32(i32) + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) + declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) + declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) ... --- -name: local_use_after_def +name: incorrect_sub_32 alignment: 2 exposesReturnsTwice: false legalized: false @@ -130,16 +102,45 @@ constants: [] machineFunctionInfo: {} body: | + ; CHECK-LABEL: name: incorrect_sub_32 + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r7 + ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr + ; CHECK: t2IT 11, 8, implicit-def $itstate + ; CHECK: tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate + ; CHECK: renamable $r12 = t2ADDri renamable $r3, 3, 14, $noreg, $noreg + ; CHECK: renamable $lr = t2MOVi 1, 14, $noreg, $noreg + ; CHECK: renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg + ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg + ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg + ; CHECK: $lr = t2DLS killed renamable $lr + ; CHECK: bb.1.vector.body: + ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 + ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg + ; CHECK: MVE_VPST 4, implicit $vpr + ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4) + ; CHECK: renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4) + ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 5, 14, $noreg + ; CHECK: renamable $q0 = nsw MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 + ; CHECK: MVE_VPST 8, implicit $vpr + ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4) + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1 + ; CHECK: bb.2.for.cond.cleanup: + ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc bb.0.entry: successors: %bb.1(0x80000000) - liveins: $r0, $r1, $r2, $r3, $lr + liveins: $r0, $r1, $r2, $r3, $r7, $lr - frame-setup tPUSH 14, $noreg, $r7, killed $lr, implicit-def $sp, implicit $sp + frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp frame-setup CFI_INSTRUCTION def_cfa_offset 8 frame-setup CFI_INSTRUCTION offset $lr, -4 frame-setup CFI_INSTRUCTION offset $r7, -8 - $r7 = frame-setup tMOVr $sp, 14, $noreg - frame-setup CFI_INSTRUCTION def_cfa_register $r7 tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr t2IT 11, 8, implicit-def $itstate tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate @@ -156,14 +157,13 @@ renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg MVE_VPST 4, implicit $vpr - renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4, !tbaa !3) - renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4, !tbaa !3) - renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg + renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4) + renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4) + renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 5, 14, $noreg renamable $q0 = nsw MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 MVE_VPST 8, implicit $vpr - renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4, !tbaa !3) + renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4) renamable $lr = t2LoopDec killed renamable $lr, 1 - $r2 = tMOVr killed $r3, 14, $noreg t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr tB %bb.2, 14, $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-8.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-8.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-8.mir @@ -0,0 +1,166 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s +--- | + define dso_local void @incorrect_sub_8(i8* noalias nocapture %A, i8* noalias nocapture readonly %B, i8* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { + entry: + %cmp8 = icmp sgt i32 %N, 0 + %0 = add i32 %N, 3 + %1 = lshr i32 %0, 2 + %2 = shl nuw i32 %1, 2 + %3 = add i32 %2, -4 + %4 = lshr i32 %3, 2 + %5 = add nuw nsw i32 %4, 1 + br i1 %cmp8, label %vector.ph, label %for.cond.cleanup + + vector.ph: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %5) + br label %vector.body + + vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv17 = phi i8* [ %scevgep18, %vector.body ], [ %A, %vector.ph ] + %lsr.iv14 = phi i8* [ %scevgep15, %vector.body ], [ %C, %vector.ph ] + %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %B, %vector.ph ] + %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ] + %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ] + %lsr.iv13 = bitcast i8* %lsr.iv to <16 x i8>* + %lsr.iv1416 = bitcast i8* %lsr.iv14 to <16 x i8>* + %lsr.iv1719 = bitcast i8* %lsr.iv17 to <16 x i8>* + %8 = call <16 x i1> @llvm.arm.mve.vctp8(i32 %7) + %9 = sub i32 %7, 15 + %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %lsr.iv13, i32 4, <16 x i1> %8, <16 x i8> undef) + %wide.masked.load12 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %lsr.iv1416, i32 4, <16 x i1> %8, <16 x i8> undef) + %10 = add nsw <16 x i8> %wide.masked.load12, %wide.masked.load + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %10, <16 x i8>* %lsr.iv1719, i32 4, <16 x i1> %8) + %scevgep = getelementptr i8, i8* %lsr.iv, i32 16 + %scevgep15 = getelementptr i8, i8* %lsr.iv14, i32 16 + %scevgep18 = getelementptr i8, i8* %lsr.iv17, i32 16 + %11 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1) + %12 = icmp ne i32 %11, 0 + br i1 %12, label %vector.body, label %for.cond.cleanup + + for.cond.cleanup: ; preds = %vector.body, %entry + ret void + } + declare void @llvm.set.loop.iterations.i32(i32) + declare <16 x i1> @llvm.arm.mve.vctp8(i32) + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) + declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) + declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>) + declare void @llvm.stackprotector(i8*, i8**) +... +--- +name: incorrect_sub_8 +alignment: 2 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } + - { reg: '$r3', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: incorrect_sub_8 + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r7 + ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr + ; CHECK: t2IT 11, 8, implicit-def $itstate + ; CHECK: tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate + ; CHECK: renamable $r12 = t2ADDri renamable $r3, 3, 14, $noreg, $noreg + ; CHECK: renamable $lr = t2MOVi 1, 14, $noreg, $noreg + ; CHECK: renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg + ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg + ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg + ; CHECK: $lr = t2DLS killed renamable $lr + ; CHECK: bb.1.vector.body: + ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 + ; CHECK: renamable $vpr = MVE_VCTP8 renamable $r3, 0, $noreg + ; CHECK: MVE_VPST 4, implicit $vpr + ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRBU8_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4) + ; CHECK: renamable $r2, renamable $q1 = MVE_VLDRBU8_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4) + ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 15, 14, $noreg + ; CHECK: renamable $q0 = nsw MVE_VADDi8 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 + ; CHECK: MVE_VPST 8, implicit $vpr + ; CHECK: renamable $r0 = MVE_VSTRBU8_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4) + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1 + ; CHECK: bb.2.for.cond.cleanup: + ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc + bb.0.entry: + successors: %bb.1(0x80000000) + liveins: $r0, $r1, $r2, $r3, $r7, $lr + + frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr + t2IT 11, 8, implicit-def $itstate + tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate + renamable $r12 = t2ADDri renamable $r3, 3, 14, $noreg, $noreg + renamable $lr = t2MOVi 1, 14, $noreg, $noreg + renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg + renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg + renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg + t2DoLoopStart renamable $lr + + bb.1.vector.body: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $lr, $r0, $r1, $r2, $r3 + + renamable $vpr = MVE_VCTP8 renamable $r3, 0, $noreg + MVE_VPST 4, implicit $vpr + renamable $r1, renamable $q0 = MVE_VLDRBU8_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4) + renamable $r2, renamable $q1 = MVE_VLDRBU8_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4) + renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 15, 14, $noreg + renamable $q0 = nsw MVE_VADDi8 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 + MVE_VPST 8, implicit $vpr + renamable $r0 = MVE_VSTRBU8_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4) + renamable $lr = t2LoopDec killed renamable $lr, 1 + t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.2, 14, $noreg + + bb.2.for.cond.cleanup: + tPOP_RET 14, $noreg, def $r7, def $pc + +... diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-lr-terminator.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-lr-terminator.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-lr-terminator.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-lr-terminator.mir @@ -111,15 +111,14 @@ ; CHECK: tPOP_RET 0, killed $cpsr, def $r4, def $pc, implicit killed $itstate ; CHECK: renamable $r12 = t2LSRri killed renamable $r3, 1, 14, $noreg, $noreg ; CHECK: renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg - ; CHECK: $lr = MVE_DLSTP_32 renamable $r12 + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r12 ; CHECK: bb.1.vector.body: ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) - ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r12 + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 ; CHECK: renamable $r4, dead $cpsr = tADDrr renamable $r1, renamable $r3, 14, $noreg ; CHECK: renamable $q0 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep45, align 1) ; CHECK: renamable $r4, dead $cpsr = tADDrr renamable $r2, renamable $r3, 14, $noreg ; CHECK: renamable $r3, dead $cpsr = tADDi8 killed renamable $r3, 4, 14, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg ; CHECK: renamable $q1 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep23, align 1) ; CHECK: renamable $q0 = nuw nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 0, killed $noreg :: (store 16 into %ir.lsr.iv1, align 4) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir @@ -118,16 +118,15 @@ ; CHECK: tPOP_RET 0, killed $cpsr, def $r4, def $pc, implicit killed $itstate ; CHECK: $r12 = t2MOVr killed $r3, 14, $noreg, $noreg ; CHECK: renamable $r12 = t2LSRri killed renamable $r12, 1, 14, $noreg, $noreg - ; CHECK: $lr = MVE_DLSTP_32 renamable $r12 + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r12 ; CHECK: renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg ; CHECK: bb.1.vector.body: ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) - ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r12 + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 ; CHECK: renamable $r4, dead $cpsr = tADDrr renamable $r1, renamable $r3, 14, $noreg ; CHECK: renamable $q0 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep45, align 1) ; CHECK: renamable $r4, dead $cpsr = tADDrr renamable $r2, renamable $r3, 14, $noreg ; CHECK: renamable $r3, dead $cpsr = tADDi8 killed renamable $r3, 4, 14, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg ; CHECK: renamable $q1 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep23, align 1) ; CHECK: renamable $q0 = nuw nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 0, killed $noreg :: (store 16 into %ir.lsr.iv1, align 4) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir @@ -117,15 +117,14 @@ ; CHECK: $r12 = t2MOVr killed $r3, 14, $noreg, $noreg ; CHECK: renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg ; CHECK: renamable $r12 = t2LSRri killed renamable $r12, 1, 14, $noreg, $noreg - ; CHECK: $lr = MVE_DLSTP_32 renamable $r12 + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r12 ; CHECK: bb.1.vector.body: ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) - ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r12 + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 ; CHECK: renamable $r4, dead $cpsr = tADDrr renamable $r1, renamable $r3, 14, $noreg ; CHECK: renamable $q0 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep45, align 1) ; CHECK: renamable $r4, dead $cpsr = tADDrr renamable $r2, renamable $r3, 14, $noreg ; CHECK: renamable $r3, dead $cpsr = tADDi8 killed renamable $r3, 4, 14, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg ; CHECK: renamable $q1 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep23, align 1) ; CHECK: renamable $q0 = nuw nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 0, killed $noreg :: (store 16 into %ir.lsr.iv1, align 4) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll @@ -90,8 +90,8 @@ ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: mov r3, r2 ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vldrh.s32 q2, [r1], #8 ; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vldrh.s32 q2, [r1], #8 ; CHECK-NEXT: vmla.u32 q0, q2, r0 ; CHECK-NEXT: letp lr, .LBB1_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block @@ -230,8 +230,8 @@ ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: mov r3, r2 ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vldrh.u32 q2, [r1], #8 ; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vldrh.u32 q2, [r1], #8 ; CHECK-NEXT: vmla.u32 q0, q2, r0 ; CHECK-NEXT: letp lr, .LBB3_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block @@ -295,8 +295,8 @@ ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: mov r3, r2 ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vldrw.u32 q2, [r1], #16 ; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 ; CHECK-NEXT: vmla.u32 q0, q2, r0 ; CHECK-NEXT: letp lr, .LBB4_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block @@ -385,10 +385,9 @@ ; CHECK-NEXT: adds r5, r0, r4 ; CHECK-NEXT: vldrb.u32 q0, [r5] ; CHECK-NEXT: adds r5, r1, r4 +; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrb.u32 q1, [r5] ; CHECK-NEXT: vmul.i32 q0, q1, q0 -; CHECK-NEXT: adds r4, #4 -; CHECK-NEXT: sub.w r12, r12, #4 ; CHECK-NEXT: vadd.i32 q0, q0, r2 ; CHECK-NEXT: vstrw.32 q0, [r3], #16 ; CHECK-NEXT: letp lr, .LBB5_5 @@ -593,7 +592,6 @@ ; CHECK-NEXT: vldrh.s32 q0, [r0], #8 ; CHECK-NEXT: vldrh.s32 q1, [r1], #8 ; CHECK-NEXT: vmul.i32 q0, q1, q0 -; CHECK-NEXT: sub.w r12, r12, #4 ; CHECK-NEXT: vadd.i32 q0, q0, r2 ; CHECK-NEXT: vstrw.32 q0, [r3], #16 ; CHECK-NEXT: letp lr, .LBB6_1 @@ -682,10 +680,9 @@ ; CHECK-NEXT: adds r5, r0, r4 ; CHECK-NEXT: vldrb.u32 q0, [r5] ; CHECK-NEXT: adds r5, r1, r4 +; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vldrb.u32 q1, [r5] ; CHECK-NEXT: vmul.i32 q0, q1, q0 -; CHECK-NEXT: adds r4, #4 -; CHECK-NEXT: sub.w r12, r12, #4 ; CHECK-NEXT: vadd.i32 q0, q0, r2 ; CHECK-NEXT: vstrw.32 q0, [r3], #16 ; CHECK-NEXT: letp lr, .LBB7_5 @@ -890,7 +887,6 @@ ; CHECK-NEXT: vldrh.u32 q0, [r0], #8 ; CHECK-NEXT: vldrh.u32 q1, [r1], #8 ; CHECK-NEXT: vmul.i32 q0, q1, q0 -; CHECK-NEXT: sub.w r12, r12, #4 ; CHECK-NEXT: vadd.i32 q0, q0, r2 ; CHECK-NEXT: vstrw.32 q0, [r3], #16 ; CHECK-NEXT: letp lr, .LBB8_1 @@ -979,7 +975,6 @@ ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: vmul.i32 q0, q1, q0 -; CHECK-NEXT: sub.w r12, r12, #4 ; CHECK-NEXT: vadd.i32 q0, q0, r2 ; CHECK-NEXT: vstrw.32 q0, [r3], #16 ; CHECK-NEXT: letp lr, .LBB9_5 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/remove-elem-moves.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/remove-elem-moves.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/remove-elem-moves.mir @@ -0,0 +1,323 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s + +--- | + define dso_local arm_aapcs_vfpcc void @remove_mov_lr_chain(float* nocapture readonly %pSrc, float* nocapture %pDst, i32 %blockSize) #0 { + entry: + %cmp5 = icmp eq i32 %blockSize, 0 + br i1 %cmp5, label %while.end, label %while.body.preheader + + while.body.preheader: ; preds = %entry + %min.iters.check = icmp ult i32 %blockSize, 4 + br i1 %min.iters.check, label %while.body.preheader19, label %vector.memcheck + + vector.memcheck: ; preds = %while.body.preheader + %scevgep = getelementptr float, float* %pDst, i32 %blockSize + %scevgep12 = getelementptr float, float* %pSrc, i32 %blockSize + %bound0 = icmp ugt float* %scevgep12, %pDst + %bound1 = icmp ugt float* %scevgep, %pSrc + %found.conflict = and i1 %bound0, %bound1 + %0 = lshr i32 %blockSize, 2 + %1 = shl nuw i32 %0, 2 + %2 = add i32 %1, -4 + %3 = lshr i32 %2, 2 + %4 = add nuw nsw i32 %3, 1 + br i1 %found.conflict, label %while.body.preheader19, label %vector.ph + + vector.ph: ; preds = %vector.memcheck + %n.vec = and i32 %blockSize, -4 + %ind.end = sub i32 %blockSize, %n.vec + %ind.end15 = getelementptr float, float* %pSrc, i32 %n.vec + %ind.end17 = getelementptr float, float* %pDst, i32 %n.vec + %scevgep9 = getelementptr float, float* %pDst, i32 -4 + %scevgep14 = getelementptr float, float* %pSrc, i32 -4 + call void @llvm.set.loop.iterations.i32(i32 %4) + br label %vector.body + + vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv15 = phi float* [ %scevgep16, %vector.body ], [ %scevgep14, %vector.ph ] + %lsr.iv10 = phi float* [ %scevgep11, %vector.body ], [ %scevgep9, %vector.ph ] + %5 = phi i32 [ %4, %vector.ph ], [ %7, %vector.body ] + %lsr.iv1517 = bitcast float* %lsr.iv15 to <4 x float>* + %lsr.iv1012 = bitcast float* %lsr.iv10 to <4 x float>* + %scevgep18 = getelementptr <4 x float>, <4 x float>* %lsr.iv1517, i32 1 + %wide.load = load <4 x float>, <4 x float>* %scevgep18, align 4 + %6 = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %wide.load) + %scevgep13 = getelementptr <4 x float>, <4 x float>* %lsr.iv1012, i32 1 + store <4 x float> %6, <4 x float>* %scevgep13, align 4 + %scevgep11 = getelementptr float, float* %lsr.iv10, i32 4 + %scevgep16 = getelementptr float, float* %lsr.iv15, i32 4 + %7 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %5, i32 1) + %8 = icmp ne i32 %7, 0 + br i1 %8, label %vector.body, label %middle.block + + middle.block: ; preds = %vector.body + %cmp.n = icmp eq i32 %n.vec, %blockSize + br i1 %cmp.n, label %while.end, label %while.body.preheader19 + + while.body.preheader19: ; preds = %middle.block, %vector.memcheck, %while.body.preheader + %blkCnt.08.ph = phi i32 [ %blockSize, %vector.memcheck ], [ %blockSize, %while.body.preheader ], [ %ind.end, %middle.block ] + %pSrc.addr.07.ph = phi float* [ %pSrc, %vector.memcheck ], [ %pSrc, %while.body.preheader ], [ %ind.end15, %middle.block ] + %pDst.addr.06.ph = phi float* [ %pDst, %vector.memcheck ], [ %pDst, %while.body.preheader ], [ %ind.end17, %middle.block ] + %scevgep1 = getelementptr float, float* %pSrc.addr.07.ph, i32 -1 + %scevgep4 = getelementptr float, float* %pDst.addr.06.ph, i32 -1 + call void @llvm.set.loop.iterations.i32(i32 %blkCnt.08.ph) + br label %while.body + + while.body: ; preds = %while.body, %while.body.preheader19 + %lsr.iv5 = phi float* [ %scevgep6, %while.body ], [ %scevgep4, %while.body.preheader19 ] + %lsr.iv = phi float* [ %scevgep2, %while.body ], [ %scevgep1, %while.body.preheader19 ] + %9 = phi i32 [ %blkCnt.08.ph, %while.body.preheader19 ], [ %12, %while.body ] + %scevgep3 = getelementptr float, float* %lsr.iv, i32 1 + %scevgep7 = getelementptr float, float* %lsr.iv5, i32 1 + %10 = load float, float* %scevgep3, align 4 + %11 = tail call fast float @llvm.fabs.f32(float %10) + store float %11, float* %scevgep7, align 4 + %scevgep2 = getelementptr float, float* %lsr.iv, i32 1 + %scevgep6 = getelementptr float, float* %lsr.iv5, i32 1 + %12 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %9, i32 1) + %13 = icmp ne i32 %12, 0 + br i1 %13, label %while.body, label %while.end + + while.end: ; preds = %while.body, %middle.block, %entry + ret void + } + declare float @llvm.fabs.f32(float) + declare <4 x float> @llvm.fabs.v4f32(<4 x float>) + declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) + +... +--- +name: remove_mov_lr_chain +alignment: 2 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 16 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 2, name: '', type: spill-slot, offset: -12, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r5', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 3, name: '', type: spill-slot, offset: -16, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: remove_mov_lr_chain + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.9(0x30000000), %bb.1(0x50000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r4, $r5, $r7 + ; CHECK: frame-setup tPUSH 14, $noreg, killed $r4, killed $r5, killed $r7, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 16 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r5, -12 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -16 + ; CHECK: tCMPi8 renamable $r2, 0, 14, $noreg, implicit-def $cpsr + ; CHECK: tBcc %bb.9, 0, killed $cpsr + ; CHECK: bb.1.while.body.preheader: + ; CHECK: successors: %bb.6(0x40000000), %bb.2(0x40000000) + ; CHECK: liveins: $r0, $r1, $r2 + ; CHECK: tCMPi8 renamable $r2, 4, 14, $noreg, implicit-def $cpsr + ; CHECK: tBcc %bb.6, 3, killed $cpsr + ; CHECK: bb.2.vector.memcheck: + ; CHECK: successors: %bb.3(0x40000000), %bb.6(0x40000000) + ; CHECK: liveins: $r0, $r1, $r2 + ; CHECK: renamable $r3 = t2ADDrs renamable $r0, renamable $r2, 18, 14, $noreg, $noreg + ; CHECK: tCMPr killed renamable $r3, renamable $r1, 14, $noreg, implicit-def $cpsr + ; CHECK: t2IT 8, 4, implicit-def $itstate + ; CHECK: renamable $r3 = t2ADDrs renamable $r1, renamable $r2, 18, 8, $cpsr, $noreg, implicit $itstate + ; CHECK: tCMPr killed renamable $r3, renamable $r0, 8, killed $cpsr, implicit-def $cpsr, implicit killed $itstate + ; CHECK: tBcc %bb.6, 8, killed $cpsr + ; CHECK: bb.3.vector.ph: + ; CHECK: successors: %bb.4(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2 + ; CHECK: renamable $r4 = t2BICri renamable $r2, 3, 14, $noreg, $noreg + ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg + ; CHECK: renamable $r12 = t2SUBri renamable $r4, 4, 14, $noreg, $noreg + ; CHECK: renamable $r7, dead $cpsr = tSUBrr renamable $r2, renamable $r4, 14, $noreg + ; CHECK: renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14, $noreg, $noreg + ; CHECK: renamable $r12 = t2ADDrs renamable $r0, renamable $r4, 18, 14, $noreg, $noreg + ; CHECK: $lr = t2DLS renamable $r3 + ; CHECK: renamable $r0, dead $cpsr = tSUBi8 killed renamable $r0, 16, 14, $noreg + ; CHECK: dead $r5 = tMOVr killed $r3, 14, $noreg + ; CHECK: renamable $r3 = t2ADDrs renamable $r1, renamable $r4, 18, 14, $noreg, $noreg + ; CHECK: renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 16, 14, $noreg + ; CHECK: bb.4.vector.body: + ; CHECK: successors: %bb.4(0x7c000000), %bb.5(0x04000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r4, $r7, $r12 + ; CHECK: renamable $r0, renamable $q0 = MVE_VLDRWU32_pre killed renamable $r0, 16, 0, $noreg :: (load 16 from %ir.scevgep18, align 4) + ; CHECK: renamable $q0 = nnan ninf nsz arcp contract afn reassoc MVE_VABSf32 killed renamable $q0, 0, $noreg, undef renamable $q0 + ; CHECK: renamable $r1 = MVE_VSTRBU8_pre killed renamable $q0, killed renamable $r1, 16, 0, $noreg :: (store 16 into %ir.scevgep13, align 4) + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.4 + ; CHECK: bb.5.middle.block: + ; CHECK: successors: %bb.7(0x80000000) + ; CHECK: liveins: $r2, $r3, $r4, $r7, $r12 + ; CHECK: tCMPr killed renamable $r4, killed renamable $r2, 14, $noreg, implicit-def $cpsr + ; CHECK: $lr = tMOVr killed $r7, 14, $noreg + ; CHECK: t2IT 0, 8, implicit-def $itstate + ; CHECK: tPOP_RET 0, killed $cpsr, def $r4, def $r5, def $r7, def $pc, implicit killed $itstate + ; CHECK: tB %bb.7, 14, $noreg + ; CHECK: bb.6: + ; CHECK: successors: %bb.7(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2 + ; CHECK: $lr = tMOVr killed $r2, 14, $noreg + ; CHECK: $r12 = tMOVr killed $r0, 14, $noreg + ; CHECK: $r3 = tMOVr killed $r1, 14, $noreg + ; CHECK: bb.7.while.body.preheader19: + ; CHECK: successors: %bb.8(0x80000000) + ; CHECK: liveins: $lr, $r3, $r12 + ; CHECK: renamable $r0, dead $cpsr = tSUBi3 killed renamable $r3, 4, 14, $noreg + ; CHECK: renamable $r1 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg + ; CHECK: $lr = t2DLS killed renamable $lr + ; CHECK: bb.8.while.body: + ; CHECK: successors: %bb.8(0x7c000000), %bb.9(0x04000000) + ; CHECK: liveins: $lr, $r0, $r1 + ; CHECK: renamable $s0 = VLDRS renamable $r1, 1, 14, $noreg :: (load 4 from %ir.scevgep3) + ; CHECK: renamable $r1, dead $cpsr = tADDi8 killed renamable $r1, 4, 14, $noreg + ; CHECK: renamable $s0 = nnan ninf nsz arcp contract afn reassoc VABSS killed renamable $s0, 14, $noreg + ; CHECK: VSTRS killed renamable $s0, renamable $r0, 1, 14, $noreg :: (store 4 into %ir.scevgep7) + ; CHECK: renamable $r0, dead $cpsr = tADDi8 killed renamable $r0, 4, 14, $noreg + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.8 + ; CHECK: bb.9.while.end: + ; CHECK: tPOP_RET 14, $noreg, def $r4, def $r5, def $r7, def $pc + bb.0.entry: + successors: %bb.9(0x30000000), %bb.1(0x50000000) + liveins: $r0, $r1, $r2, $r4, $r5, $r7, $lr + + frame-setup tPUSH 14, $noreg, killed $r4, killed $r5, killed $r7, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 16 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + frame-setup CFI_INSTRUCTION offset $r5, -12 + frame-setup CFI_INSTRUCTION offset $r4, -16 + tCMPi8 renamable $r2, 0, 14, $noreg, implicit-def $cpsr + tBcc %bb.9, 0, killed $cpsr + + bb.1.while.body.preheader: + successors: %bb.6(0x40000000), %bb.2(0x40000000) + liveins: $r0, $r1, $r2 + + tCMPi8 renamable $r2, 4, 14, $noreg, implicit-def $cpsr + tBcc %bb.6, 3, killed $cpsr + + bb.2.vector.memcheck: + successors: %bb.3(0x40000000), %bb.6(0x40000000) + liveins: $r0, $r1, $r2 + + renamable $r3 = t2ADDrs renamable $r0, renamable $r2, 18, 14, $noreg, $noreg + tCMPr killed renamable $r3, renamable $r1, 14, $noreg, implicit-def $cpsr + t2IT 8, 4, implicit-def $itstate + renamable $r3 = t2ADDrs renamable $r1, renamable $r2, 18, 8, $cpsr, $noreg, implicit $itstate + tCMPr killed renamable $r3, renamable $r0, 8, killed $cpsr, implicit-def $cpsr, implicit killed $itstate + tBcc %bb.6, 8, killed $cpsr + + bb.3.vector.ph: + successors: %bb.4(0x80000000) + liveins: $r0, $r1, $r2 + + renamable $r4 = t2BICri renamable $r2, 3, 14, $noreg, $noreg + renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg + renamable $r12 = t2SUBri renamable $r4, 4, 14, $noreg, $noreg + renamable $r7, dead $cpsr = tSUBrr renamable $r2, renamable $r4, 14, $noreg + renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14, $noreg, $noreg + renamable $r12 = t2ADDrs renamable $r0, renamable $r4, 18, 14, $noreg, $noreg + t2DoLoopStart renamable $r3 + renamable $r0, dead $cpsr = tSUBi8 killed renamable $r0, 16, 14, $noreg + $r5 = tMOVr killed $r3, 14, $noreg + renamable $r3 = t2ADDrs renamable $r1, renamable $r4, 18, 14, $noreg, $noreg + renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 16, 14, $noreg + + bb.4.vector.body: + successors: %bb.4(0x7c000000), %bb.5(0x04000000) + liveins: $r0, $r1, $r2, $r3, $r4, $r5, $r7, $r12 + + renamable $r0, renamable $q0 = MVE_VLDRWU32_pre killed renamable $r0, 16, 0, $noreg :: (load 16 from %ir.scevgep18, align 4) + $lr = tMOVr killed $r5, 14, $noreg + renamable $q0 = nnan ninf nsz arcp contract afn reassoc MVE_VABSf32 killed renamable $q0, 0, $noreg, undef renamable $q0 + renamable $r1 = MVE_VSTRBU8_pre killed renamable $q0, killed renamable $r1, 16, 0, $noreg :: (store 16 into %ir.scevgep13, align 4) + renamable $lr = t2LoopDec killed renamable $lr, 1 + $r5 = tMOVr $lr, 14, $noreg + t2LoopEnd killed renamable $lr, %bb.4, implicit-def dead $cpsr + tB %bb.5, 14, $noreg + + bb.5.middle.block: + successors: %bb.7(0x80000000) + liveins: $r2, $r3, $r4, $r7, $r12 + + tCMPr killed renamable $r4, killed renamable $r2, 14, $noreg, implicit-def $cpsr + $lr = tMOVr killed $r7, 14, $noreg + t2IT 0, 8, implicit-def $itstate + tPOP_RET 0, killed $cpsr, def $r4, def $r5, def $r7, def $pc, implicit killed $itstate + tB %bb.7, 14, $noreg + + bb.6: + successors: %bb.7(0x80000000) + liveins: $r0, $r1, $r2 + + $lr = tMOVr killed $r2, 14, $noreg + $r12 = tMOVr killed $r0, 14, $noreg + $r3 = tMOVr killed $r1, 14, $noreg + + bb.7.while.body.preheader19: + successors: %bb.8(0x80000000) + liveins: $lr, $r3, $r12 + + renamable $r0, dead $cpsr = tSUBi3 killed renamable $r3, 4, 14, $noreg + renamable $r1 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg + t2DoLoopStart renamable $lr + + bb.8.while.body: + successors: %bb.8(0x7c000000), %bb.9(0x04000000) + liveins: $lr, $r0, $r1 + + renamable $s0 = VLDRS renamable $r1, 1, 14, $noreg :: (load 4 from %ir.scevgep3) + renamable $r1, dead $cpsr = tADDi8 killed renamable $r1, 4, 14, $noreg + renamable $s0 = nnan ninf nsz arcp contract afn reassoc VABSS killed renamable $s0, 14, $noreg + VSTRS killed renamable $s0, renamable $r0, 1, 14, $noreg :: (store 4 into %ir.scevgep7) + renamable $r0, dead $cpsr = tADDi8 killed renamable $r0, 4, 14, $noreg + renamable $lr = t2LoopDec killed renamable $lr, 1 + t2LoopEnd renamable $lr, %bb.8, implicit-def dead $cpsr + tB %bb.9, 14, $noreg + + bb.9.while.end: + tPOP_RET 14, $noreg, def $r4, def $r5, def $r7, def $pc + +... diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-use-after.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-use-after.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-use-after.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-use-after.mir @@ -1,12 +1,10 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=thumbv8.1m.main -run-pass=arm-low-overhead-loops %s -verify-machineinstrs -o - | FileCheck %s -# CHECK-NOT: $lr = t2DLS -# CHECK: $lr = tMOVr $r0, 14 -# CHECK-NOT: $lr = t2LEUpdate --- | target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" target triple = "thumbv8.1m.main" - + define i32 @do_copy(i32 %n, i32* nocapture %p, i32* nocapture readonly %q) { entry: %scevgep = getelementptr i32, i32* %q, i32 -1 @@ -16,7 +14,7 @@ preheader: br label %while.body - + while.body: ; preds = %while.body, %entry %lsr.iv4 = phi i32* [ %scevgep5, %while.body ], [ %scevgep3, %preheader ] %lsr.iv = phi i32* [ %scevgep1, %while.body ], [ %scevgep, %preheader ] @@ -30,14 +28,14 @@ %2 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %0, i32 1) %3 = icmp ne i32 %2, 0 br i1 %3, label %while.body, label %while.end - + while.end: ; preds = %while.body ret i32 0 } - + declare void @llvm.set.loop.iterations.i32(i32) #0 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #0 - + attributes #0 = { noduplicate nounwind } attributes #1 = { nounwind } @@ -78,20 +76,45 @@ restorePoint: '' fixedStack: [] stack: - - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } callSites: [] constants: [] machineFunctionInfo: {} body: | + ; CHECK-LABEL: name: do_copy + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r2, $r7 + ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: renamable $r0 = t2SUBri killed renamable $lr, 4, 14, $noreg, def dead $cpsr + ; CHECK: renamable $r1, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14, $noreg + ; CHECK: bb.1.preheader: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1 + ; CHECK: $lr = tMOVr $r0, 14, $noreg + ; CHECK: bb.2.while.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) + ; CHECK: liveins: $lr, $r0, $r1 + ; CHECK: renamable $r2, renamable $r1 = t2LDR_PRE killed renamable $r1, 4, 14, $noreg :: (load 4 from %ir.scevgep6) + ; CHECK: early-clobber renamable $r0 = t2STR_PRE killed renamable $r2, killed renamable $r0, 4, 14, $noreg :: (store 4 into %ir.scevgep2) + ; CHECK: $lr = t2SUBri killed renamable $lr, 1, 14, $noreg, def $cpsr + ; CHECK: tBcc %bb.2, 1, killed $cpsr + ; CHECK: tB %bb.3, 14, $noreg + ; CHECK: bb.3.while.end: + ; CHECK: $r0, dead $cpsr = tMOVi8 0, 14, $noreg + ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc, implicit killed $r0 bb.0.entry: successors: %bb.1(0x80000000) liveins: $r0, $r1, $r2, $r7, $lr - + frame-setup tPUSH 14, $noreg, killed $r7, implicit-def $sp, implicit $sp frame-setup CFI_INSTRUCTION def_cfa_offset 8 frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -104,17 +127,17 @@ successors: %bb.2(0x80000000) liveins: $r0 $lr = tMOVr $r0, 14, $noreg - + bb.2.while.body: successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $lr, $r0, $r1 - + renamable $r2, renamable $r1 = t2LDR_PRE killed renamable $r1, 4, 14, $noreg :: (load 4 from %ir.scevgep6) early-clobber renamable $r0 = t2STR_PRE killed renamable $r2, killed renamable $r0, 4, 14, $noreg :: (store 4 into %ir.scevgep2) renamable $lr = t2LoopDec killed renamable $lr, 1 t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr tB %bb.3, 14, $noreg - + bb.3.while.end: $r0, dead $cpsr = tMOVi8 0, 14, $noreg tPOP_RET 14, $noreg, def $r7, def $pc, implicit killed $r0 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir @@ -30,55 +30,27 @@ %vctp = call <4 x i1> @llvm.arm.mve.vctp32(i32 %tmp8) %and = and <4 x i1> %vctp, %invariant.mask %tmp11 = sub i32 %tmp8, 4 - %tmp17 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv24, i32 4, <4 x i1> %and, <4 x i32> undef), !tbaa !3 + %tmp17 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv24, i32 4, <4 x i1> %and, <4 x i32> undef) %tmp18 = icmp ne <4 x i32> %tmp17, zeroinitializer %tmp20 = and <4 x i1> %tmp18, %vctp - %tmp22 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1, i32 4, <4 x i1> %tmp20, <4 x i32> undef), !tbaa !3 + %tmp22 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1, i32 4, <4 x i1> %tmp20, <4 x i32> undef) %tmp23 = mul nsw <4 x i32> %tmp22, %tmp17 - call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp23, <4 x i32>* %lsr.iv1, i32 4, <4 x i1> %tmp20), !tbaa !3 + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp23, <4 x i32>* %lsr.iv1, i32 4, <4 x i1> %tmp20) %tmp12 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp7, i32 1) %tmp13 = icmp ne i32 %tmp12, 0 %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 %scevgep3 = getelementptr i32, i32* %lsr.iv2, i32 4 - br i1 %tmp13, label %bb9, label %bb27, !llvm.loop !7 + br i1 %tmp13, label %bb9, label %bb27 bb27: ; preds = %bb9, %bb ret void } - ; Function Attrs: argmemonly nounwind readonly willreturn - declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #1 - ; Function Attrs: argmemonly nounwind willreturn - declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2 - ; Function Attrs: noduplicate nounwind - declare void @llvm.set.loop.iterations.i32(i32) #3 - ; Function Attrs: noduplicate nounwind - declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 - ; Function Attrs: nounwind readnone - declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4 - ; Function Attrs: nounwind readnone - declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #4 - ; Function Attrs: nounwind - declare void @llvm.stackprotector(i8*, i8**) #5 - - attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mve" "unsafe-fp-math"="false" "use-soft-float"="false" } - attributes #1 = { argmemonly nounwind readonly willreturn "target-features"="+mve" } - attributes #2 = { argmemonly nounwind willreturn "target-features"="+mve" } - attributes #3 = { noduplicate nounwind "target-features"="+mve" } - attributes #4 = { nounwind readnone "target-features"="+mve" } - attributes #5 = { nounwind } - - !llvm.module.flags = !{!0, !1} - !llvm.ident = !{!2} - - !0 = !{i32 1, !"wchar_size", i32 4} - !1 = !{i32 1, !"min_enum_size", i32 4} - !2 = !{!"clang version 10.0.0 (https://github.com/llvm/llvm-project.git 8f92f97150cbdd3b9f569570b8377db78ed61a9e)"} - !3 = !{!4, !4, i64 0} - !4 = !{!"int", !5, i64 0} - !5 = !{!"omnipotent char", !6, i64 0} - !6 = !{!"Simple C/C++ TBAA"} - !7 = distinct !{!7, !8} - !8 = !{!"llvm.loop.isvectorized", i32 1} + declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) + declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) + declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) + declare <4 x i1> @llvm.arm.mve.vctp32(i32) + declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) ... --- @@ -135,7 +107,7 @@ ; CHECK: bb.0.bb: ; CHECK: successors: %bb.3(0x30000000), %bb.1(0x50000000) ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 - ; CHECK: frame-setup tPUSH 14, $noreg, $r7, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup tPUSH 14, $noreg, killed $lr, implicit-def $sp, implicit $sp ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 @@ -161,15 +133,15 @@ ; CHECK: renamable $vpr = VLDR_P0_off $sp, 0, 14, $noreg :: (load 4 from %stack.0) ; CHECK: MVE_VPST 4, implicit $vpr ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 1, killed renamable $vpr - ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4, !tbaa !3) + ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4) ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14, $noreg ; CHECK: MVE_VPST 4, implicit $vpr ; CHECK: renamable $vpr = MVE_VCMPi32r renamable $q0, $zr, 1, 1, killed renamable $vpr - ; CHECK: renamable $r3, renamable $q1 = MVE_VLDRWU32_post killed renamable $r3, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4, !tbaa !3) + ; CHECK: renamable $r3, renamable $q1 = MVE_VLDRWU32_post killed renamable $r3, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4) ; CHECK: renamable $q0 = nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 ; CHECK: MVE_VPST 8, implicit $vpr - ; CHECK: MVE_VSTRWU32 killed renamable $q0, killed renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4, !tbaa !3) + ; CHECK: MVE_VSTRWU32 killed renamable $q0, killed renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4) ; CHECK: $r0 = tMOVr $r3, 14, $noreg ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 ; CHECK: bb.3.bb27: @@ -179,7 +151,7 @@ successors: %bb.3(0x30000000), %bb.1(0x50000000) liveins: $r0, $r1, $r2, $r3, $lr - frame-setup tPUSH 14, $noreg, $r7, killed $lr, implicit-def $sp, implicit $sp + frame-setup tPUSH 14, $noreg, killed $lr, implicit-def $sp, implicit $sp frame-setup CFI_INSTRUCTION def_cfa_offset 8 frame-setup CFI_INSTRUCTION offset $lr, -4 frame-setup CFI_INSTRUCTION offset $r7, -8 @@ -209,15 +181,15 @@ renamable $vpr = VLDR_P0_off $sp, 0, 14, $noreg :: (load 4 from %stack.0) MVE_VPST 4, implicit $vpr renamable $vpr = MVE_VCTP32 renamable $r2, 1, killed renamable $vpr - renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4, !tbaa !3) + renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4) renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14, $noreg MVE_VPST 4, implicit $vpr renamable $vpr = MVE_VCMPi32r renamable $q0, $zr, 1, 1, killed renamable $vpr - renamable $r3, renamable $q1 = MVE_VLDRWU32_post killed renamable $r3, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4, !tbaa !3) + renamable $r3, renamable $q1 = MVE_VLDRWU32_post killed renamable $r3, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4) renamable $q0 = nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 MVE_VPST 8, implicit $vpr - MVE_VSTRWU32 killed renamable $q0, killed renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4, !tbaa !3) + MVE_VSTRWU32 killed renamable $q0, killed renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4) renamable $lr = t2LoopDec killed renamable $lr, 1 $r0 = tMOVr $r3, 14, $noreg t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt.mir @@ -29,53 +29,25 @@ %vctp = call <4 x i1> @llvm.arm.mve.vctp32(i32 %tmp8) %and = and <4 x i1> %vctp, %invariant.mask %tmp11 = sub i32 %tmp8, 4 - %tmp17 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv24, i32 4, <4 x i1> %and, <4 x i32> undef), !tbaa !3 - %tmp22 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1, i32 4, <4 x i1> %and, <4 x i32> undef), !tbaa !3 + %tmp17 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv24, i32 4, <4 x i1> %and, <4 x i32> undef) + %tmp22 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1, i32 4, <4 x i1> %and, <4 x i32> undef) %tmp23 = mul nsw <4 x i32> %tmp22, %tmp17 - call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp23, <4 x i32>* %lsr.iv1, i32 4, <4 x i1> %and), !tbaa !3 + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp23, <4 x i32>* %lsr.iv1, i32 4, <4 x i1> %and) %tmp12 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp7, i32 1) %tmp13 = icmp ne i32 %tmp12, 0 %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 %scevgep3 = getelementptr i32, i32* %lsr.iv2, i32 4 - br i1 %tmp13, label %bb9, label %bb27, !llvm.loop !7 + br i1 %tmp13, label %bb9, label %bb27 bb27: ; preds = %bb9, %bb ret void } - ; Function Attrs: argmemonly nounwind readonly willreturn declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #1 - ; Function Attrs: argmemonly nounwind willreturn declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2 - ; Function Attrs: noduplicate nounwind declare void @llvm.set.loop.iterations.i32(i32) #3 - ; Function Attrs: noduplicate nounwind declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 - ; Function Attrs: nounwind readnone declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4 - ; Function Attrs: nounwind readnone declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #4 - ; Function Attrs: nounwind - declare void @llvm.stackprotector(i8*, i8**) #5 - - attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mve" "unsafe-fp-math"="false" "use-soft-float"="false" } - attributes #1 = { argmemonly nounwind readonly willreturn "target-features"="+mve" } - attributes #2 = { argmemonly nounwind willreturn "target-features"="+mve" } - attributes #3 = { noduplicate nounwind "target-features"="+mve" } - attributes #4 = { nounwind readnone "target-features"="+mve" } - attributes #5 = { nounwind } - - !llvm.module.flags = !{!0, !1} - !llvm.ident = !{!2} - - !0 = !{i32 1, !"wchar_size", i32 4} - !1 = !{i32 1, !"min_enum_size", i32 4} - !2 = !{!"clang version 10.0.0 (https://github.com/llvm/llvm-project.git 8f92f97150cbdd3b9f569570b8377db78ed61a9e)"} - !3 = !{!4, !4, i64 0} - !4 = !{!"int", !5, i64 0} - !5 = !{!"omnipotent char", !6, i64 0} - !6 = !{!"Simple C/C++ TBAA"} - !7 = distinct !{!7, !8} - !8 = !{!"llvm.loop.isvectorized", i32 1} ... --- @@ -132,7 +104,7 @@ ; CHECK: bb.0.bb: ; CHECK: successors: %bb.3(0x30000000), %bb.1(0x50000000) ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 - ; CHECK: frame-setup tPUSH 14, $noreg, $r7, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup tPUSH 14, $noreg, killed $lr, implicit-def $sp, implicit $sp ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 @@ -152,11 +124,11 @@ ; CHECK: liveins: $lr, $r0, $r1, $r3 ; CHECK: renamable $vpr = VLDR_P0_off $sp, 0, 14, $noreg :: (load 4 from %stack.0) ; CHECK: MVE_VPST 4, implicit $vpr - ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4, !tbaa !3) - ; CHECK: renamable $r3, renamable $q1 = MVE_VLDRWU32_post killed renamable $r3, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4, !tbaa !3) + ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4) + ; CHECK: renamable $r3, renamable $q1 = MVE_VLDRWU32_post killed renamable $r3, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4) ; CHECK: renamable $q0 = nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 ; CHECK: MVE_VPST 8, implicit $vpr - ; CHECK: MVE_VSTRWU32 killed renamable $q0, killed renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4, !tbaa !3) + ; CHECK: MVE_VSTRWU32 killed renamable $q0, killed renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4) ; CHECK: $r0 = tMOVr $r3, 14, $noreg ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 ; CHECK: bb.3.bb27: @@ -166,7 +138,7 @@ successors: %bb.3(0x30000000), %bb.1(0x50000000) liveins: $r0, $r1, $r2, $r3, $lr - frame-setup tPUSH 14, $noreg, $r7, killed $lr, implicit-def $sp, implicit $sp + frame-setup tPUSH 14, $noreg, killed $lr, implicit-def $sp, implicit $sp frame-setup CFI_INSTRUCTION def_cfa_offset 8 frame-setup CFI_INSTRUCTION offset $lr, -4 frame-setup CFI_INSTRUCTION offset $r7, -8 @@ -196,12 +168,12 @@ renamable $vpr = VLDR_P0_off $sp, 0, 14, $noreg :: (load 4 from %stack.0) MVE_VPST 2, implicit $vpr renamable $vpr = MVE_VCTP32 renamable $r2, 1, killed renamable $vpr - renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4, !tbaa !3) - renamable $r3, renamable $q1 = MVE_VLDRWU32_post killed renamable $r3, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4, !tbaa !3) + renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4) + renamable $r3, renamable $q1 = MVE_VLDRWU32_post killed renamable $r3, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4) renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14, $noreg renamable $q0 = nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 MVE_VPST 8, implicit $vpr - MVE_VSTRWU32 killed renamable $q0, killed renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4, !tbaa !3) + MVE_VSTRWU32 killed renamable $q0, killed renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4) renamable $lr = t2LoopDec killed renamable $lr, 1 $r0 = tMOVr $r3, 14, $noreg t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update3.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subi3.mir rename from llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update3.mir rename to llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subi3.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update3.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subi3.mir @@ -1,20 +1,8 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve,+lob -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s -# Local use after def, this mov is using r3: -# -# $r2 = tMOVr killed $r3, 14, $noreg -# -# We should optimise away the SUB - -# CHECK: bb.1.vector.body: -# CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg -# CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 - --- | - target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" - target triple = "thumbv8.1m.main-arm-unknown-eabi" - - define dso_local void @local_use_after_def(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { + define dso_local void @vctp_tsubi3(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { entry: %cmp8 = icmp sgt i32 %N, 0 %0 = add i32 %N, 3 @@ -38,53 +26,32 @@ %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>* %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>* - %8 = call <4 x i1> @llvm.arm.vctp32(i32 %7) - %9 = sub i32 %7, 4 - %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %8, <4 x i32> undef), !tbaa !3 - %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %8, <4 x i32> undef), !tbaa !3 + %8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7) + %9 = sub i32 %7, 5 + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %8, <4 x i32> undef) + %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %8, <4 x i32> undef) %10 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load - call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %10, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %8), !tbaa !3 + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %10, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %8) %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4 %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4 %11 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1) %12 = icmp ne i32 %11, 0 - br i1 %12, label %vector.body, label %for.cond.cleanup, !llvm.loop !7 + br i1 %12, label %vector.body, label %for.cond.cleanup for.cond.cleanup: ; preds = %vector.body, %entry ret void } declare void @llvm.set.loop.iterations.i32(i32) #1 - declare <4 x i1> @llvm.arm.vctp32(i32) #2 + declare <4 x i1> @llvm.arm.mve.vctp32(i32) #2 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1 declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #3 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #4 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #3 - declare void @llvm.stackprotector(i8*, i8**) #5 - - attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+fp-armv8d16sp,+fp16,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" "unsafe-fp-math"="true" "use-soft-float"="false" } - attributes #1 = { noduplicate nounwind } - attributes #2 = { nounwind readnone } - attributes #3 = { argmemonly nounwind willreturn } - attributes #4 = { argmemonly nounwind readonly willreturn } - attributes #5 = { nounwind } - - !llvm.module.flags = !{!0, !1} - !llvm.ident = !{!2} - - !0 = !{i32 1, !"wchar_size", i32 4} - !1 = !{i32 1, !"min_enum_size", i32 4} - !2 = !{!"clang version 10.0.0 (http://github.com/llvm/llvm-project 2589b6d9edda73280fe1dc1d944ee34e22fe9a6f)"} - !3 = !{!4, !4, i64 0} - !4 = !{!"int", !5, i64 0} - !5 = !{!"omnipotent char", !6, i64 0} - !6 = !{!"Simple C++ TBAA"} - !7 = distinct !{!7, !8} - !8 = !{!"llvm.loop.isvectorized", i32 1} ... --- -name: local_use_after_def +name: vctp_tsubi3 alignment: 2 exposesReturnsTwice: false legalized: false @@ -130,16 +97,36 @@ constants: [] machineFunctionInfo: {} body: | + ; CHECK-LABEL: name: vctp_tsubi3 + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r7 + ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr + ; CHECK: t2IT 11, 8, implicit-def $itstate + ; CHECK: tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r3 + ; CHECK: bb.1.vector.body: + ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2 + ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 0, $noreg :: (load 16 from %ir.lsr.iv13, align 4) + ; CHECK: renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 0, $noreg :: (load 16 from %ir.lsr.iv1416, align 4) + ; CHECK: renamable $q0 = nsw MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 + ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 0, killed $noreg :: (store 16 into %ir.lsr.iv1719, align 4) + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 + ; CHECK: bb.2.for.cond.cleanup: + ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc bb.0.entry: successors: %bb.1(0x80000000) - liveins: $r0, $r1, $r2, $r3, $lr + liveins: $r0, $r1, $r2, $r3, $r7, $lr - frame-setup tPUSH 14, $noreg, $r7, killed $lr, implicit-def $sp, implicit $sp + frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp frame-setup CFI_INSTRUCTION def_cfa_offset 8 frame-setup CFI_INSTRUCTION offset $lr, -4 frame-setup CFI_INSTRUCTION offset $r7, -8 - $r7 = frame-setup tMOVr $sp, 14, $noreg - frame-setup CFI_INSTRUCTION def_cfa_register $r7 tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr t2IT 11, 8, implicit-def $itstate tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate @@ -156,14 +143,13 @@ renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg MVE_VPST 4, implicit $vpr - renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4, !tbaa !3) - renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4, !tbaa !3) - renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg + renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4) + renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4) + renamable $r3, dead $cpsr = tSUBi3 killed renamable $r3, 4, 14, $noreg renamable $q0 = nsw MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 MVE_VPST 8, implicit $vpr - renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4, !tbaa !3) + renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4) renamable $lr = t2LoopDec killed renamable $lr, 1 - $r2 = tMOVr killed $r3, 14, $noreg t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr tB %bb.2, 14, $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update2.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subri.mir copy from llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update2.mir copy to llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subri.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update2.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subri.mir @@ -1,20 +1,8 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve,+lob -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s -# The CPSR is not dead: -# -# renamable $r3, $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg -# -# We shouldn't optimise away the SUB. - -# CHECK: bb.1.vector.body: -# CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg -# CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 - --- | - target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" - target triple = "thumbv8.1m.main-arm-unknown-eabi" - - define dso_local void @CPSR_not_dead(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { + define dso_local void @vctp_tsubi3(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { entry: %cmp8 = icmp sgt i32 %N, 0 %0 = add i32 %N, 3 @@ -38,53 +26,31 @@ %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>* %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>* - %8 = call <4 x i1> @llvm.arm.vctp32(i32 %7) - %9 = sub i32 %7, 4 - %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %8, <4 x i32> undef), !tbaa !3 - %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %8, <4 x i32> undef), !tbaa !3 + %8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7) + %9 = sub i32 %7, 5 + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %8, <4 x i32> undef) + %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %8, <4 x i32> undef) %10 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load - call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %10, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %8), !tbaa !3 + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %10, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %8) %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4 %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4 %11 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1) %12 = icmp ne i32 %11, 0 - br i1 %12, label %vector.body, label %for.cond.cleanup, !llvm.loop !7 + br i1 %12, label %vector.body, label %for.cond.cleanup for.cond.cleanup: ; preds = %vector.body, %entry ret void } - declare void @llvm.set.loop.iterations.i32(i32) #1 - declare <4 x i1> @llvm.arm.vctp32(i32) #2 - declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1 - declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #3 - declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #4 - declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #3 - declare void @llvm.stackprotector(i8*, i8**) #5 - - attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+fp-armv8d16sp,+fp16,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" "unsafe-fp-math"="true" "use-soft-float"="false" } - attributes #1 = { noduplicate nounwind } - attributes #2 = { nounwind readnone } - attributes #3 = { argmemonly nounwind willreturn } - attributes #4 = { argmemonly nounwind readonly willreturn } - attributes #5 = { nounwind } - - !llvm.module.flags = !{!0, !1} - !llvm.ident = !{!2} - - !0 = !{i32 1, !"wchar_size", i32 4} - !1 = !{i32 1, !"min_enum_size", i32 4} - !2 = !{!"clang version 10.0.0 (http://github.com/llvm/llvm-project 2589b6d9edda73280fe1dc1d944ee34e22fe9a6f)"} - !3 = !{!4, !4, i64 0} - !4 = !{!"int", !5, i64 0} - !5 = !{!"omnipotent char", !6, i64 0} - !6 = !{!"Simple C++ TBAA"} - !7 = distinct !{!7, !8} - !8 = !{!"llvm.loop.isvectorized", i32 1} + declare void @llvm.set.loop.iterations.i32(i32) + declare <4 x i1> @llvm.arm.mve.vctp32(i32) + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) + declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) + declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) ... --- -name: CPSR_not_dead +name: vctp_tsubi3 alignment: 2 exposesReturnsTwice: false legalized: false @@ -130,16 +96,36 @@ constants: [] machineFunctionInfo: {} body: | + ; CHECK-LABEL: name: vctp_tsubi3 + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r7 + ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr + ; CHECK: t2IT 11, 8, implicit-def $itstate + ; CHECK: tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r3 + ; CHECK: bb.1.vector.body: + ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2 + ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 0, $noreg :: (load 16 from %ir.lsr.iv13, align 4) + ; CHECK: renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 0, $noreg :: (load 16 from %ir.lsr.iv1416, align 4) + ; CHECK: renamable $q0 = nsw MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 + ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 0, killed $noreg :: (store 16 into %ir.lsr.iv1719, align 4) + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 + ; CHECK: bb.2.for.cond.cleanup: + ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc bb.0.entry: successors: %bb.1(0x80000000) - liveins: $r0, $r1, $r2, $r3, $lr + liveins: $r0, $r1, $r2, $r3, $r7, $lr - frame-setup tPUSH 14, $noreg, $r7, killed $lr, implicit-def $sp, implicit $sp + frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp frame-setup CFI_INSTRUCTION def_cfa_offset 8 frame-setup CFI_INSTRUCTION offset $lr, -4 frame-setup CFI_INSTRUCTION offset $r7, -8 - $r7 = frame-setup tMOVr $sp, 14, $noreg - frame-setup CFI_INSTRUCTION def_cfa_register $r7 tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr t2IT 11, 8, implicit-def $itstate tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate @@ -156,12 +142,12 @@ renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg MVE_VPST 4, implicit $vpr - renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4, !tbaa !3) - renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4, !tbaa !3) - renamable $r3, $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg + renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4) + renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4) + renamable $r3 = t2SUBri killed renamable $r3, 4, 14, $noreg, $noreg renamable $q0 = nsw MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 MVE_VPST 8, implicit $vpr - renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4, !tbaa !3) + renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4) renamable $lr = t2LoopDec killed renamable $lr, 1 t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr tB %bb.2, 14, $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update2.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subri12.mir copy from llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update2.mir copy to llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subri12.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update2.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-subri12.mir @@ -1,20 +1,8 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve,+lob -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s -# The CPSR is not dead: -# -# renamable $r3, $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg -# -# We shouldn't optimise away the SUB. - -# CHECK: bb.1.vector.body: -# CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg -# CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 - --- | - target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" - target triple = "thumbv8.1m.main-arm-unknown-eabi" - - define dso_local void @CPSR_not_dead(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { + define dso_local void @vctp_tsubi3(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { entry: %cmp8 = icmp sgt i32 %N, 0 %0 = add i32 %N, 3 @@ -38,53 +26,31 @@ %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>* %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>* - %8 = call <4 x i1> @llvm.arm.vctp32(i32 %7) - %9 = sub i32 %7, 4 - %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %8, <4 x i32> undef), !tbaa !3 - %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %8, <4 x i32> undef), !tbaa !3 + %8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7) + %9 = sub i32 %7, 5 + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %8, <4 x i32> undef) + %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %8, <4 x i32> undef) %10 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load - call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %10, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %8), !tbaa !3 + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %10, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %8) %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4 %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4 %11 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1) %12 = icmp ne i32 %11, 0 - br i1 %12, label %vector.body, label %for.cond.cleanup, !llvm.loop !7 + br i1 %12, label %vector.body, label %for.cond.cleanup for.cond.cleanup: ; preds = %vector.body, %entry ret void } - declare void @llvm.set.loop.iterations.i32(i32) #1 - declare <4 x i1> @llvm.arm.vctp32(i32) #2 - declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1 - declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #3 - declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #4 - declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #3 - declare void @llvm.stackprotector(i8*, i8**) #5 - - attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+fp-armv8d16sp,+fp16,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" "unsafe-fp-math"="true" "use-soft-float"="false" } - attributes #1 = { noduplicate nounwind } - attributes #2 = { nounwind readnone } - attributes #3 = { argmemonly nounwind willreturn } - attributes #4 = { argmemonly nounwind readonly willreturn } - attributes #5 = { nounwind } - - !llvm.module.flags = !{!0, !1} - !llvm.ident = !{!2} - - !0 = !{i32 1, !"wchar_size", i32 4} - !1 = !{i32 1, !"min_enum_size", i32 4} - !2 = !{!"clang version 10.0.0 (http://github.com/llvm/llvm-project 2589b6d9edda73280fe1dc1d944ee34e22fe9a6f)"} - !3 = !{!4, !4, i64 0} - !4 = !{!"int", !5, i64 0} - !5 = !{!"omnipotent char", !6, i64 0} - !6 = !{!"Simple C++ TBAA"} - !7 = distinct !{!7, !8} - !8 = !{!"llvm.loop.isvectorized", i32 1} + declare void @llvm.set.loop.iterations.i32(i32) + declare <4 x i1> @llvm.arm.mve.vctp32(i32) + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) + declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) + declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) ... --- -name: CPSR_not_dead +name: vctp_tsubi3 alignment: 2 exposesReturnsTwice: false legalized: false @@ -130,16 +96,36 @@ constants: [] machineFunctionInfo: {} body: | + ; CHECK-LABEL: name: vctp_tsubi3 + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r7 + ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr + ; CHECK: t2IT 11, 8, implicit-def $itstate + ; CHECK: tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r3 + ; CHECK: bb.1.vector.body: + ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2 + ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 0, $noreg :: (load 16 from %ir.lsr.iv13, align 4) + ; CHECK: renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 0, $noreg :: (load 16 from %ir.lsr.iv1416, align 4) + ; CHECK: renamable $q0 = nsw MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 + ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 0, killed $noreg :: (store 16 into %ir.lsr.iv1719, align 4) + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 + ; CHECK: bb.2.for.cond.cleanup: + ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc bb.0.entry: successors: %bb.1(0x80000000) - liveins: $r0, $r1, $r2, $r3, $lr + liveins: $r0, $r1, $r2, $r3, $r7, $lr - frame-setup tPUSH 14, $noreg, $r7, killed $lr, implicit-def $sp, implicit $sp + frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp frame-setup CFI_INSTRUCTION def_cfa_offset 8 frame-setup CFI_INSTRUCTION offset $lr, -4 frame-setup CFI_INSTRUCTION offset $r7, -8 - $r7 = frame-setup tMOVr $sp, 14, $noreg - frame-setup CFI_INSTRUCTION def_cfa_register $r7 tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr t2IT 11, 8, implicit-def $itstate tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate @@ -156,12 +142,12 @@ renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg MVE_VPST 4, implicit $vpr - renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4, !tbaa !3) - renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4, !tbaa !3) - renamable $r3, $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg + renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4) + renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4) + renamable $r3 = t2SUBri12 killed renamable $r3, 4, 14, $noreg renamable $q0 = nsw MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 MVE_VPST 8, implicit $vpr - renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4, !tbaa !3) + renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4) renamable $lr = t2LoopDec killed renamable $lr, 1 t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr tB %bb.2, 14, $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vmaxmin_vpred_r.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vmaxmin_vpred_r.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vmaxmin_vpred_r.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vmaxmin_vpred_r.mir @@ -150,22 +150,21 @@ ; CHECK: frame-setup CFI_INSTRUCTION offset $r5, -16 ; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -20 ; CHECK: renamable $r12 = t2LDRi12 $sp, 44, 14, $noreg :: (load 4 from %fixed-stack.6, align 8) - ; CHECK: $lr = MVE_WLSTP_32 renamable $r12, %bb.3 + ; CHECK: $lr = MVE_WLSTP_32 killed renamable $r12, %bb.3 ; CHECK: bb.1.for.body.lr.ph: ; CHECK: successors: %bb.2(0x80000000) - ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r12 + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 ; CHECK: $r7, $r6 = t2LDRDi8 $sp, 36, 14, $noreg :: (load 4 from %fixed-stack.4, align 8), (load 4 from %fixed-stack.5) ; CHECK: $r5, $r4 = t2LDRDi8 $sp, 20, 14, $noreg :: (load 4 from %fixed-stack.0, align 8), (load 4 from %fixed-stack.1) ; CHECK: renamable $q0 = MVE_VDUP32 killed renamable $r6, 0, $noreg, undef renamable $q0 ; CHECK: renamable $q1 = MVE_VDUP32 killed renamable $r7, 0, $noreg, undef renamable $q1 ; CHECK: bb.2.for.body: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $lr, $q0, $q1, $r0, $r1, $r2, $r3, $r4, $r5, $r12 + ; CHECK: liveins: $lr, $q0, $q1, $r0, $r1, $r2, $r3, $r4, $r5 ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRWU32_post killed renamable $r1, 4, 0, $noreg :: (load 16 from %ir.input_2_cast, align 4) ; CHECK: renamable $r0, renamable $q3 = MVE_VLDRWU32_post killed renamable $r0, 4, 0, $noreg :: (load 16 from %ir.input_1_cast, align 4) ; CHECK: renamable $q2 = MVE_VADD_qr_i32 killed renamable $q2, renamable $r3, 0, $noreg, undef renamable $q2 ; CHECK: renamable $q3 = MVE_VADD_qr_i32 killed renamable $q3, renamable $r2, 0, $noreg, undef renamable $q3 - ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg ; CHECK: renamable $q2 = MVE_VMULi32 killed renamable $q3, killed renamable $q2, 0, $noreg, undef renamable $q2 ; CHECK: renamable $q2 = MVE_VADD_qr_i32 killed renamable $q2, renamable $r4, 0, $noreg, undef renamable $q2 ; CHECK: renamable $q2 = MVE_VMAXu32 killed renamable $q2, renamable $q1, 0, $noreg, undef renamable $q2 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vmldava_in_vpt.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vmldava_in_vpt.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vmldava_in_vpt.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vmldava_in_vpt.mir @@ -139,7 +139,7 @@ ; CHECK: bb.0.entry: ; CHECK: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7 - ; CHECK: frame-setup tPUSH 14, $noreg, killed $r4, $r5, killed $r6, killed $r7, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup tPUSH 14, $noreg, killed $r4, killed $r5, killed $r6, killed $r7, killed $lr, implicit-def $sp, implicit $sp ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 20 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 @@ -148,11 +148,10 @@ ; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -20 ; CHECK: renamable $r7 = tLDRspi $sp, 10, 14, $noreg :: (load 4 from %fixed-stack.5) ; CHECK: renamable $r12 = t2MOVi 0, 14, $noreg, $noreg - ; CHECK: dead $lr = MVE_WLSTP_32 killed renamable $r7, %bb.3 + ; CHECK: $lr = MVE_WLSTP_32 killed renamable $r7, %bb.3 ; CHECK: bb.1.for.body.lr.ph: ; CHECK: successors: %bb.2(0x80000000) - ; CHECK: liveins: $r0, $r1, $r2, $r3, $r5 - ; CHECK: $r6 = tMOVr killed $r5, 14, $noreg + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 ; CHECK: $r5, $r12 = t2LDRDi8 $sp, 32, 14, $noreg :: (load 4 from %fixed-stack.3), (load 4 from %fixed-stack.4, align 8) ; CHECK: renamable $r4 = tLDRspi $sp, 5, 14, $noreg :: (load 4 from %fixed-stack.0, align 8) ; CHECK: renamable $q0 = MVE_VDUP32 killed renamable $r12, 0, $noreg, undef renamable $q0 @@ -160,19 +159,17 @@ ; CHECK: renamable $r12 = t2MOVi 0, 14, $noreg, $noreg ; CHECK: bb.2.for.body: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $q0, $q1, $r0, $r1, $r2, $r3, $r4, $r6, $r12 + ; CHECK: liveins: $lr, $q0, $q1, $r0, $r1, $r2, $r3, $r4, $r12 ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRWU32_post killed renamable $r1, 4, 0, $noreg :: (load 16 from %ir.input_2_cast, align 4) ; CHECK: renamable $r0, renamable $q3 = MVE_VLDRWU32_post killed renamable $r0, 4, 0, $noreg :: (load 16 from %ir.input_1_cast, align 4) ; CHECK: renamable $q2 = MVE_VADD_qr_i32 killed renamable $q2, renamable $r3, 0, $noreg, undef renamable $q2 ; CHECK: renamable $q3 = MVE_VADD_qr_i32 killed renamable $q3, renamable $r2, 0, $noreg, undef renamable $q3 - ; CHECK: $lr = tMOVr $r6, 14, $noreg ; CHECK: renamable $q2 = MVE_VMULi32 killed renamable $q3, killed renamable $q2, 0, $noreg, undef renamable $q2 - ; CHECK: renamable $r6, dead $cpsr = tSUBi8 killed $r6, 1, 14, $noreg ; CHECK: renamable $q2 = MVE_VADD_qr_i32 killed renamable $q2, renamable $r4, 0, $noreg, undef renamable $q2 ; CHECK: renamable $q2 = MVE_VMAXu32 killed renamable $q2, renamable $q1, 0, $noreg, undef renamable $q2 ; CHECK: renamable $q3 = MVE_VMINu32 renamable $q2, renamable $q0, 0, $noreg, undef renamable $q3 ; CHECK: renamable $r12 = MVE_VMLADAVas32 killed renamable $r12, killed renamable $q3, killed renamable $q2, 0, killed $noreg - ; CHECK: dead $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 ; CHECK: bb.3.for.cond.cleanup: ; CHECK: liveins: $r12 ; CHECK: $r0 = tMOVr killed $r12, 14, $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir @@ -195,22 +195,21 @@ ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -8 - ; CHECK: $lr = MVE_WLSTP_8 renamable $r3, %bb.1 + ; CHECK: $lr = MVE_WLSTP_8 killed renamable $r3, %bb.1 ; CHECK: tB %bb.3, 14, $noreg ; CHECK: bb.1.vector.ph: ; CHECK: successors: %bb.2(0x80000000) - ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 + ; CHECK: liveins: $lr, $r0, $r1, $r2 ; CHECK: renamable $r12 = t2MOVi 0, 14, $noreg, $noreg ; CHECK: bb.2.vector.body: ; CHECK: successors: %bb.3(0x04000000), %bb.2(0x7c000000) - ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r12 + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r12 ; CHECK: renamable $r4 = t2ADDrr renamable $r1, renamable $r12, 14, $noreg, $noreg ; CHECK: renamable $q0 = MVE_VLDRBU8 killed renamable $r4, 0, 0, $noreg :: (load 16 from %ir.scevgep45, align 1) ; CHECK: renamable $r4 = t2ADDrr renamable $r2, renamable $r12, 14, $noreg, $noreg ; CHECK: renamable $q1 = MVE_VLDRBU8 killed renamable $r4, 0, 0, $noreg :: (load 16 from %ir.scevgep23, align 1) ; CHECK: renamable $r4 = t2ADDrr renamable $r0, renamable $r12, 14, $noreg, $noreg ; CHECK: renamable $r12 = t2ADDri killed renamable $r12, 16, 14, $noreg, $noreg - ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 16, 14, $noreg ; CHECK: renamable $q0 = MVE_VMULi8 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 ; CHECK: MVE_VSTRBU8 killed renamable $q0, killed renamable $r4, 0, 0, killed $noreg :: (store 16 into %ir.scevgep1, align 1) ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 @@ -318,11 +317,11 @@ ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 - ; CHECK: $lr = MVE_WLSTP_16 renamable $r3, %bb.1 + ; CHECK: $lr = MVE_WLSTP_16 killed renamable $r3, %bb.1 ; CHECK: tB %bb.2, 14, $noreg ; CHECK: bb.1.vector.body: ; CHECK: successors: %bb.2(0x04000000), %bb.1(0x7c000000) - ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 + ; CHECK: liveins: $lr, $r0, $r1, $r2 ; CHECK: renamable $q0 = MVE_VLDRHU16 renamable $r1, 0, 0, $noreg :: (load 16 from %ir.lsr.iv57, align 2) ; CHECK: renamable $q1 = MVE_VLDRHU16 renamable $r2, 0, 0, $noreg :: (load 16 from %ir.lsr.iv24, align 2) ; CHECK: renamable $q0 = MVE_VMULi16 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 @@ -330,7 +329,6 @@ ; CHECK: renamable $r1, dead $cpsr = tADDi8 killed renamable $r1, 16, 14, $noreg ; CHECK: renamable $r2, dead $cpsr = tADDi8 killed renamable $r2, 16, 14, $noreg ; CHECK: renamable $r0, dead $cpsr = tADDi8 killed renamable $r0, 16, 14, $noreg - ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 8, 14, $noreg ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 ; CHECK: bb.2.for.cond.cleanup: ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc