Index: llvm/include/llvm/CodeGen/MachineMatcher.h =================================================================== --- /dev/null +++ llvm/include/llvm/CodeGen/MachineMatcher.h @@ -0,0 +1,78 @@ +#include <functional> + +namespace llvm { + +class MachineBasicBlock; +class MachineInstr; +class MachineOperand; +class ReachingDefAnalysis; + +// A small wrapper class around RDA which provides helpers for common +// functions like finding a definition and testing its opcode. +class MachineMatcher { + ReachingDefAnalysis &RDA; + + using OpcodeMatcher = std::function<bool(unsigned)>; + using MIMatcher = std::function<bool(MachineInstr *)>; + +public: + MachineMatcher() = delete; + MachineMatcher(ReachingDefAnalysis &RDA) : RDA(RDA) { } + + /// If the given operand index is a register, return the unique MachineInstr + /// that produces the definition. Return nullptr if a unique producer doesn't + /// exist. + MachineInstr *MIOperand(MachineInstr *MI, unsigned OpNum) const; + + /// Provide the unique machine instruction that produces the definition that + /// is consumed by MI via MO. Return nullptr if a unique producer doesn't + /// exist. + MachineInstr* MIOperand(MachineInstr *MI, MachineOperand &MO) const; + + /// Provide the unique machine instruction that produces the definition that + /// is consumed by the first register use of MI. + MachineInstr* FirstMIUse(MachineInstr *MI) const; + + /// Provide the unique machine instruction that produces the definition that + /// is consumed by the last register use of MI. + MachineInstr* LastMIUse(MachineInstr *MI) const; + + /// Test whether the first immediate use of MI is equal to Imm. + bool FirstImmUse(MachineInstr *MI, int64_t Imm) const; + + /// Provide the unique machine instruction that produces the definition that + /// is consumed by the first register use of MI if its opcode matches.
+ MachineInstr* FirstUseOpcode(MachineInstr *MI, + OpcodeMatcher MatchOpcode) const; + + /// Provide the unique machine instruction that produces the definition that + /// is consumed by the last register use of MI if its opcode matches. + MachineInstr* LastUseOpcode(MachineInstr *MI, + OpcodeMatcher MatchOpcode) const; + + /// Inspect the unique defining instruction that MI uses and test it using + /// the provided function. + MachineInstr* Operand(MachineInstr *MI, unsigned OpNum, + MIMatcher MIMatch) const; + + /// Inspect the unique defining instruction that MI uses and test its opcode + /// using the provided function. + MachineInstr* OperandOpcode(MachineInstr *MI, unsigned OpNum, + OpcodeMatcher MatchOpcode) const; + + /// Return the unique MachineInstr which produces the definition for the + /// given register at MI, if it matches. + MachineInstr* ReachingDef(MachineInstr *MI, int PhysReg, + MIMatcher MatchMI) const; + + /// Return the unique MachineInstr which produces the definition for liveout + /// of the given register in MBB, if it matches. + MachineInstr* LiveOut(MachineBasicBlock *MBB, int PhysReg, + MIMatcher MatchMI) const; + + /// Check that both instructions use the given register and return whether + /// both instructions use the same definition.
+ bool UsesSameValue(MachineInstr *A, MachineInstr *B, int PhysReg) const; +}; + +} // end namespace llvm Index: llvm/include/llvm/CodeGen/ReachingDefAnalysis.h =================================================================== --- llvm/include/llvm/CodeGen/ReachingDefAnalysis.h +++ llvm/include/llvm/CodeGen/ReachingDefAnalysis.h @@ -22,6 +22,7 @@ #define LLVM_CODEGEN_REACHINGDEFSANALYSIS_H #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/LoopTraversal.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -37,6 +38,7 @@ private: MachineFunction *MF; const TargetRegisterInfo *TRI; + LoopTraversal::TraversalOrder TraversedMBBOrder; unsigned NumRegUnits; /// Instruction that defined each register, relative to the beginning of the /// current basic block. When a LiveRegsDefInfo is used to represent a @@ -93,6 +95,15 @@ MachineFunctionProperties::Property::TracksLiveness); } + /// Re-run the analysis. + void reset(); + + /// Initialize data structures. + void init(); + + /// Traverse the machine function, mapping definitions. + void traverse(); + /// Provides the instruction id of the closest reaching def instruction of /// PhysReg that reaches MI, relative to the begining of MI's basic block. int getReachingDef(MachineInstr *MI, int PhysReg) const; @@ -148,6 +159,17 @@ void getGlobalUses(MachineInstr *MI, int PhysReg, InstSet &Uses) const; + /// Search for the instruction(s) which define the given register. If no + /// instruction defines a live-out in MBB, then visit its predecessors. + void getLiveOuts(MachineBasicBlock *MBB, int PhysReg, + SetVector &Incoming, + SmallPtrSetImpl &VisitedBBs) const; + + /// Provide the only instruction to produce the value in the given register + /// which is available to MI, or nullptr if a unique instruction doesn't + /// exist.
+ MachineInstr *getUniqueReachingMIDef(MachineInstr *MI, int PhysReg) const; + /// Return whether From can be moved forwards to just before To. bool isSafeToMoveForwards(MachineInstr *From, MachineInstr *To) const; Index: llvm/lib/CodeGen/CMakeLists.txt =================================================================== --- llvm/lib/CodeGen/CMakeLists.txt +++ llvm/lib/CodeGen/CMakeLists.txt @@ -82,6 +82,7 @@ MachineLICM.cpp MachineLoopInfo.cpp MachineLoopUtils.cpp + MachineMatcher.cpp MachineModuleInfo.cpp MachineModuleInfoImpls.cpp MachineOperand.cpp Index: llvm/lib/CodeGen/MachineMatcher.cpp =================================================================== --- /dev/null +++ llvm/lib/CodeGen/MachineMatcher.cpp @@ -0,0 +1,108 @@ +#include "llvm/CodeGen/MachineMatcher.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/ReachingDefAnalysis.h" + +using namespace llvm; + +MachineInstr* MachineMatcher::MIOperand(MachineInstr *MI, unsigned OpNum) const { + assert(MI->getOperand(OpNum).isReg() && MI->getOperand(OpNum).isUse() && + "Not a register use"); + return RDA.getUniqueReachingMIDef(MI, MI->getOperand(OpNum).getReg()); +} + +MachineInstr* MachineMatcher::MIOperand(MachineInstr *MI, + MachineOperand &MO) const { + assert(MO.isReg() && MO.isUse() && "Not a register use"); + return RDA.getUniqueReachingMIDef(MI, MO.getReg()); +} + +MachineInstr* MachineMatcher::FirstMIUse(MachineInstr *MI) const { + for (auto &MO : MI->uses()) { + // uses() may yield implicit defs; only consider real register uses. + if (!MO.isReg() || !MO.isUse() || MO.getReg() == 0) + continue; + return MIOperand(MI, MO); + } + return nullptr; +} + +MachineInstr* MachineMatcher::LastMIUse(MachineInstr *MI) const { + for (auto &MO : reverse(MI->uses())) { + // uses() may yield implicit defs; only consider real register uses. + if (!MO.isReg() || !MO.isUse() || MO.getReg() == 0) + continue; + return MIOperand(MI, MO); + } + return nullptr; +} + +bool MachineMatcher::FirstImmUse(MachineInstr *MI, int64_t Imm) const { + for (auto &MO : MI->uses()) { + if (!MO.isImm()) + continue; + return MO.getImm() == Imm; + } + 
return false; +} + +MachineInstr* +MachineMatcher::FirstUseOpcode(MachineInstr *MI, + OpcodeMatcher MatchOpcode) const { + if (auto *Def = FirstMIUse(MI)) + if (MatchOpcode(Def->getOpcode())) + return Def; + return nullptr; +} + +MachineInstr* +MachineMatcher::LastUseOpcode(MachineInstr *MI, + OpcodeMatcher MatchOpcode) const { + if (auto *Def = LastMIUse(MI)) + if (MatchOpcode(Def->getOpcode())) + return Def; + return nullptr; +} + +MachineInstr* MachineMatcher::Operand(MachineInstr *MI, unsigned OpNum, + MIMatcher MatchMI) const { + if (auto *Def = MIOperand(MI, OpNum)) + if (MatchMI(Def)) + return Def; + return nullptr; +} + +MachineInstr* MachineMatcher::OperandOpcode(MachineInstr *MI, unsigned OpNum, + OpcodeMatcher MatchOpcode) const { + if (auto *Def = MIOperand(MI, OpNum)) + if (MatchOpcode(Def->getOpcode())) + return Def; + return nullptr; +} + +MachineInstr* MachineMatcher::ReachingDef(MachineInstr *MI, int PhysReg, + MIMatcher MatchMI) const { + if (auto *Def = RDA.getUniqueReachingMIDef(MI, PhysReg)) + if (MatchMI(Def)) + return Def; + return nullptr; +} + +MachineInstr* MachineMatcher::LiveOut(MachineBasicBlock *MBB, int PhysReg, + MIMatcher MatchMI) const { + if (auto *Def = RDA.getLocalLiveOutMIDef(MBB, PhysReg)) + if (MatchMI(Def)) + return Def; + return nullptr; +} + +bool MachineMatcher::UsesSameValue(MachineInstr *A, MachineInstr *B, + int PhysReg) const { + for (auto &MOA : A->uses()) + if (MOA.isReg() && MOA.getReg() == PhysReg) + for (auto &MOB : B->uses()) + if (MOB.isReg() && MOB.getReg() == PhysReg) + return RDA.hasSameReachingDef(A, B, PhysReg); + + return false; +} Index: llvm/lib/CodeGen/ReachingDefAnalysis.cpp =================================================================== --- llvm/lib/CodeGen/ReachingDefAnalysis.cpp +++ llvm/lib/CodeGen/ReachingDefAnalysis.cpp @@ -136,38 +136,46 @@ bool ReachingDefAnalysis::runOnMachineFunction(MachineFunction &mf) { MF = &mf; TRI = MF->getSubtarget().getRegisterInfo(); + LLVM_DEBUG(dbgs() << 
"********** REACHING DEFINITION ANALYSIS **********\n"); + init(); + traverse(); + return false; +} - LiveRegs.clear(); - NumRegUnits = TRI->getNumRegUnits(); - - MBBReachingDefs.resize(mf.getNumBlockIDs()); +void ReachingDefAnalysis::releaseMemory() { + // Clear the internal vectors. + MBBOutRegsInfos.clear(); + MBBReachingDefs.clear(); + InstIds.clear(); +} - LLVM_DEBUG(dbgs() << "********** REACHING DEFINITION ANALYSIS **********\n"); +void ReachingDefAnalysis::reset() { + // init() only resizes the per-block tables; clear the old results first. + releaseMemory(); + init(); + traverse(); +} +void ReachingDefAnalysis::init() { + LiveRegs.clear(); + InstIds.clear(); + NumRegUnits = TRI->getNumRegUnits(); + MBBReachingDefs.resize(MF->getNumBlockIDs()); // Initialize the MBBOutRegsInfos - MBBOutRegsInfos.resize(mf.getNumBlockIDs()); + MBBOutRegsInfos.resize(MF->getNumBlockIDs()); + LoopTraversal Traversal; + TraversedMBBOrder = Traversal.traverse(*MF); +} +void ReachingDefAnalysis::traverse() { // Traverse the basic blocks. - LoopTraversal Traversal; - LoopTraversal::TraversalOrder TraversedMBBOrder = Traversal.traverse(mf); - for (LoopTraversal::TraversedMBBInfo TraversedMBB : TraversedMBBOrder) { + for (LoopTraversal::TraversedMBBInfo TraversedMBB : TraversedMBBOrder) processBasicBlock(TraversedMBB); - } - // Sorting all reaching defs found for a ceartin reg unit in a given BB. for (MBBDefsInfo &MBBDefs : MBBReachingDefs) { for (MBBRegUnitDefs &RegUnitDefs : MBBDefs) llvm::sort(RegUnitDefs); } - - return false; -} - -void ReachingDefAnalysis::releaseMemory() { - // Clear the internal vectors. 
- MBBOutRegsInfos.clear(); - MBBReachingDefs.clear(); - InstIds.clear(); } int ReachingDefAnalysis::getReachingDef(MachineInstr *MI, int PhysReg) const { @@ -256,9 +262,8 @@ } } -bool -ReachingDefAnalysis::getLiveInUses(MachineBasicBlock *MBB, int PhysReg, - InstSet &Uses) const { +bool ReachingDefAnalysis::getLiveInUses(MachineBasicBlock *MBB, int PhysReg, + InstSet &Uses) const { for (auto &MI : *MBB) { if (MI.isDebugInstr()) continue; @@ -273,9 +278,8 @@ return isReachingDefLiveOut(&MBB->back(), PhysReg); } -void -ReachingDefAnalysis::getGlobalUses(MachineInstr *MI, int PhysReg, - InstSet &Uses) const { +void ReachingDefAnalysis::getGlobalUses(MachineInstr *MI, int PhysReg, + InstSet &Uses) const { MachineBasicBlock *MBB = MI->getParent(); // Collect the uses that each def touches within the block. @@ -303,6 +307,42 @@ } } +void +ReachingDefAnalysis::getLiveOuts(MachineBasicBlock *MBB, int PhysReg, + SetVector &Incoming, + SmallPtrSetImpl &VisitedBBs) const { + if (VisitedBBs.count(MBB)) + return; + + VisitedBBs.insert(MBB); + LivePhysRegs LiveRegs(*TRI); + LiveRegs.addLiveOuts(*MBB); + if (!LiveRegs.contains(PhysReg)) + return; + + if (auto *Def = getLocalLiveOutMIDef(MBB, PhysReg)) + Incoming.insert(Def); + else { + for (auto *Pred : MBB->predecessors()) + getLiveOuts(Pred, PhysReg, Incoming, VisitedBBs); + } +} + +MachineInstr *ReachingDefAnalysis::getUniqueReachingMIDef(MachineInstr *MI, + int PhysReg) const { + if (auto *Def = getReachingMIDef(MI, PhysReg)) + return Def; + + SmallPtrSet VisitedBBs; + SetVector Incoming; + for (auto *Pred : MI->getParent()->predecessors()) + getLiveOuts(Pred, PhysReg, Incoming, VisitedBBs); + + if (Incoming.size() == 1) + return Incoming.front(); + return nullptr; +} + bool ReachingDefAnalysis::isRegUsedAfter(MachineInstr *MI, int PhysReg) const { MachineBasicBlock *MBB = MI->getParent(); LivePhysRegs LiveRegs(*TRI); Index: llvm/lib/Target/ARM/ARMBaseInstrInfo.h 
=================================================================== --- llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -662,6 +662,10 @@ Opc == ARM::t2SUBri || Opc == ARM::t2SUBri12 || Opc == ARM::t2SUBSri; } +static inline bool isLSRImmOpcode(int Opc) { + return Opc == ARM::LSRi || Opc == ARM::tLSRri || Opc == ARM::t2LSRri; +} + static inline bool isMovRegOpcode(int Opc) { return Opc == ARM::MOVr || Opc == ARM::tMOVr || Opc == ARM::t2MOVr; } Index: llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp =================================================================== --- llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -49,6 +49,7 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineLoopUtils.h" +#include "llvm/CodeGen/MachineMatcher.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/ReachingDefAnalysis.h" @@ -175,9 +176,11 @@ struct LowOverheadLoop { - MachineLoop *ML = nullptr; - MachineLoopInfo *MLI = nullptr; - ReachingDefAnalysis *RDA = nullptr; + MachineLoop &ML; + MachineLoopInfo &MLI; + const TargetRegisterInfo &TRI; + const ARMBaseInstrInfo &TII; + ReachingDefAnalysis &RDA; MachineFunction *MF = nullptr; MachineInstr *InsertPt = nullptr; MachineInstr *Start = nullptr; @@ -191,9 +194,11 @@ bool Revert = false; bool CannotTailPredicate = false; - LowOverheadLoop(MachineLoop *ML, MachineLoopInfo *MLI, - ReachingDefAnalysis *RDA) : ML(ML), MLI(MLI), RDA(RDA) { - MF = ML->getHeader()->getParent(); + LowOverheadLoop(MachineLoop &ML, MachineLoopInfo &MLI, + const TargetRegisterInfo &TRI, const ARMBaseInstrInfo &TII, + ReachingDefAnalysis &RDA) + : ML(ML), MLI(MLI), TRI(TRI), TII(TII), RDA(RDA) { + MF = ML.getHeader()->getParent(); } // If this is an MVE instruction, check that we know how to use tail @@ -209,11 +214,15 @@ // For now, let's keep things really simple and 
only support a single // block for tail predication. return !Revert && FoundAllComponents() && VCTP && - !CannotTailPredicate && ML->getNumBlocks() == 1; + !CannotTailPredicate && ML.getNumBlocks() == 1; } bool ValidateTailPredicate(MachineInstr *StartInsertPt); + // Check whether any of MIs defs are live-out, and if so, whether they're + // predicated by the VCTP. + bool ValidateLiveOuts(MachineInstr *MI); + // Is it safe to define LR with DLS/WLS? // LR can be defined if it is the operand to start, because it's the same // value, or if it's going to be equivalent to the operand to Start. @@ -322,29 +331,125 @@ unsigned CountReg = Start->getOperand(0).getReg(); auto IsMoveLR = [&CountReg](MachineInstr *MI) { return MI->getOpcode() == ARM::tMOVr && - MI->getOperand(0).getReg() == ARM::LR && MI->getOperand(1).getReg() == CountReg && MI->getOperand(2).getImm() == ARMCC::AL; }; MachineBasicBlock *MBB = Start->getParent(); + MachineMatcher Matcher(RDA); // Find an insertion point: // - Is there a (mov lr, Count) before Start? If so, and nothing else writes // to Count before Start, we can insert at that mov. - if (auto *LRDef = RDA->getReachingMIDef(Start, ARM::LR)) - if (IsMoveLR(LRDef) && RDA->hasSameReachingDef(Start, LRDef, CountReg)) + if (auto *LRDef = Matcher.ReachingDef(Start, ARM::LR, IsMoveLR)) + if (RDA.hasSameReachingDef(Start, LRDef, CountReg)) return LRDef; // - Is there a (mov lr, Count) after Start? If so, and nothing else writes // to Count after Start, we can insert at that mov. - if (auto *LRDef = RDA->getLocalLiveOutMIDef(MBB, ARM::LR)) - if (IsMoveLR(LRDef) && RDA->hasSameReachingDef(Start, LRDef, CountReg)) + if (auto *LRDef = Matcher.LiveOut(MBB, ARM::LR, IsMoveLR)) + if (RDA.hasSameReachingDef(Start, LRDef, CountReg)) return LRDef; // We've found no suitable LR def and Start doesn't use LR directly. Can we // just define LR anyway? - return RDA->isSafeToDefRegAt(Start, ARM::LR) ? Start : nullptr; + return RDA.isSafeToDefRegAt(Start, ARM::LR) ? 
Start : nullptr; +} + +bool LowOverheadLoop::ValidateLiveOuts(MachineInstr *MI) { + const TargetRegisterClass *QPRs = TRI.getRegClass(ARM::MQPRRegClassID); + SmallVector LiveOuts; + for (auto &MO : MI->operands()) { + if (!MO.isReg() || MO.getReg() == 0 || !QPRs->contains(MO.getReg())) + continue; + else if (MO.isDef() && isRegLiveInExitBlocks(&ML, MO.getReg())) + LiveOuts.push_back(MO.getReg()); + } + + if (LiveOuts.empty()) + return true; + + LLVM_DEBUG(dbgs() << "ARM Loops: Validating whether MI is safe live out: " + << *MI); + + // Return whether the vpsel is predicated on the same value of the vctp in + // the last iteration. + auto UsesEquivalentPredicate = [this](MachineInstr *Use) { + MachineMatcher Matcher(RDA); + unsigned VPRIdx = llvm::findFirstVPTPredOperandIdx(*Use) + 1; + MachineInstr *Pred = + Matcher.OperandOpcode(Use, VPRIdx, [this](unsigned Opcode) { + return Opcode == VCTP->getOpcode(); + }); + + if (!Pred) { + LLVM_DEBUG(dbgs() << "ARM Loops: Not predicated on a VCTP.\n"); + return false; + } + + // We expect the element count to either be defined in the loop body or the + // preheader. + MachineInstr *ExitBlockElems = Matcher.MIOperand(Pred, 1); + if (!ExitBlockElems || TII.getPredicate(*ExitBlockElems) != ARMCC::AL) + return false; + + // First, check if the VCTP is using the element count produced from within + // the loop. + if (isMovRegOpcode(ExitBlockElems->getOpcode())) + return Matcher.UsesSameValue(ExitBlockElems, VCTP, + VCTP->getOperand(1).getReg()); + + // Also check if the VCTP is using the exiting element count calculated in + // the preheader. 
The instructions will look like something like this, where + // X is the vector factor: + // BackedgeCount = (SUB (BIC (ADD TotalElems, X-1), X-1), X) + // TripCount = (ADD BackedgeCount, 1) + // ExitBlockElems = (SUB TotalElems, (LSR BackedgeCount, log2(X))) + + MachineInstr *TripCount = Matcher.MIOperand(Start, 0); + if (!TripCount) + return false; + + if (auto *LSR = Matcher.LastUseOpcode(ExitBlockElems, isLSRImmOpcode)) { + unsigned ShiftAmt = log2(getTailPredVectorWidth(VCTP->getOpcode())); + if (Matcher.FirstImmUse(LSR, ShiftAmt) && + TII.getPredicate(*LSR) == ARMCC::AL) { + if (auto *BackedgeCount = Matcher.FirstMIUse(LSR)) + return BackedgeCount == Matcher.LastMIUse(TripCount); + } + } + + LLVM_DEBUG(dbgs() << "ARM Loops: Failed to match predicate.\n"); + return false; + }; + + + LLVM_DEBUG(dbgs() << "ARM Loops: Inspecting loop live-outs.\n"); + SmallVector ExitBlocks; + ML.getExitBlocks(ExitBlocks); + + // Any live-out values should be somehow predicated upon a vctp that is + // equivalent to the predication happening within loop. Otherwise, when + // we perform tail-predication, we may be predicating instructions that + // should not be predicated. + for (auto *MBB : ExitBlocks) { + for (auto Reg : LiveOuts) { + SmallPtrSet Uses; + RDA.getLiveInUses(MBB, Reg, Uses); + + for (auto *Use : Uses) { + LLVM_DEBUG(dbgs() << "ARM Loops: Live out use: " << *Use); + if (Use->getOpcode() != ARM::MVE_VPSEL) + return false; + + if (!UsesEquivalentPredicate(Use)) { + LLVM_DEBUG(dbgs() << "ARM Loops: Not using equivalent predicate.\n"); + return false; + } + } + } + } + return true; } bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt) { @@ -381,7 +486,7 @@ // If the register is defined within loop, then we can't perform TP. // TODO: Check whether this is just a mov of a register that would be // available. 
- if (RDA->hasLocalDefBefore(VCTP, NumElements)) { + if (RDA.hasLocalDefBefore(VCTP, NumElements)) { LLVM_DEBUG(dbgs() << "ARM Loops: VCTP operand is defined in the loop.\n"); return false; } @@ -390,14 +495,14 @@ // need to try to move either InsertPt or the def so that the [w|d]lstp can // use the value. MachineBasicBlock *InsertBB = StartInsertPt->getParent(); - if (!RDA->isReachingDefLiveOut(StartInsertPt, NumElements)) { - if (auto *ElemDef = RDA->getLocalLiveOutMIDef(InsertBB, NumElements)) { - if (RDA->isSafeToMoveForwards(ElemDef, StartInsertPt)) { + if (!RDA.isReachingDefLiveOut(StartInsertPt, NumElements)) { + if (auto *ElemDef = RDA.getLocalLiveOutMIDef(InsertBB, NumElements)) { + if (RDA.isSafeToMoveForwards(ElemDef, StartInsertPt)) { ElemDef->removeFromParent(); InsertBB->insert(MachineBasicBlock::iterator(StartInsertPt), ElemDef); LLVM_DEBUG(dbgs() << "ARM Loops: Moved element count def: " << *ElemDef); - } else if (RDA->isSafeToMoveBackwards(StartInsertPt, ElemDef)) { + } else if (RDA.isSafeToMoveBackwards(StartInsertPt, ElemDef)) { StartInsertPt->removeFromParent(); InsertBB->insertAfter(MachineBasicBlock::iterator(ElemDef), StartInsertPt); @@ -416,7 +521,7 @@ auto CannotProvideElements = [this](MachineBasicBlock *MBB, Register NumElements) { // NumElements is redefined in this block. - if (RDA->hasLocalDefBefore(&MBB->back(), NumElements)) + if (RDA.hasLocalDefBefore(&MBB->back(), NumElements)) return true; // Don't continue searching up through multiple predecessors. @@ -427,7 +532,7 @@ }; // First, find the block that looks like the preheader. 
- MachineBasicBlock *MBB = MLI->findLoopPreheader(ML, true); + MachineBasicBlock *MBB = MLI.findLoopPreheader(&ML, true); if (!MBB) { LLVM_DEBUG(dbgs() << "ARM Loops: Didn't find preheader.\n"); return false; @@ -464,12 +569,12 @@ }; MBB = VCTP->getParent(); - if (MachineInstr *Def = RDA->getReachingMIDef(&MBB->back(), NumElements)) { + if (MachineInstr *Def = RDA.getReachingMIDef(&MBB->back(), NumElements)) { SmallPtrSet ElementChain; SmallPtrSet Ignore = { VCTP }; unsigned ExpectedVectorWidth = getTailPredVectorWidth(VCTP->getOpcode()); - if (RDA->isSafeToRemove(Def, ElementChain, Ignore)) { + if (RDA.isSafeToRemove(Def, ElementChain, Ignore)) { bool FoundSub = false; for (auto *MI : ElementChain) { @@ -490,6 +595,13 @@ ToRemove.insert(ElementChain.begin(), ElementChain.end()); } } + + for (auto *MBB : ML.getBlocks()) + for (auto &MI : *MBB) + if (!ValidateLiveOuts(&MI)) + return false; + + LLVM_DEBUG(dbgs() << "ARM Loops: Will use tail predication.\n"); return true; } @@ -502,7 +614,7 @@ // TODO Maybe there's cases where the target doesn't have to be the header, // but for now be safe and revert. - if (End->getOperand(1).getMBB() != ML->getHeader()) { + if (End->getOperand(1).getMBB() != ML.getHeader()) { LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targetting header.\n"); Revert = true; return; @@ -510,8 +622,8 @@ // The WLS and LE instructions have 12-bits for the label offset. WLS // requires a positive offset, while LE uses negative. 
- if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML->getHeader()) || - !BBUtils->isBBInRange(End, ML->getHeader(), 4094)) { + if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML.getHeader()) || + !BBUtils->isBBInRange(End, ML.getHeader(), 4094)) { LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n"); Revert = true; return; @@ -541,7 +653,7 @@ return; } - assert(ML->getBlocks().size() == 1 && + assert(ML.getBlocks().size() == 1 && "Shouldn't be processing a loop with more than one block"); CannotTailPredicate = !ValidateTailPredicate(InsertPt); LLVM_DEBUG(if (CannotTailPredicate) @@ -615,7 +727,6 @@ LLVM_DEBUG(dbgs() << "ARM Loops: Can't tail predicate: " << *MI); return false; } - return true; } @@ -678,7 +789,7 @@ return nullptr; }; - LowOverheadLoop LoLoop(ML, MLI, RDA); + LowOverheadLoop LoLoop(*ML, *MLI, *TRI, *TII, *RDA); // Search the preheader for the start intrinsic. // FIXME: I don't see why we shouldn't be supporting multiple predecessors // with potentially multiple set.loop.iterations, so we need to enable this. @@ -823,11 +934,9 @@ LLVM_DEBUG(dbgs() << "ARM Loops: Expanding LoopStart.\n"); // When using tail-predication, try to delete the dead code that was used to // calculate the number of loop iterations. 
+ MachineMatcher Matcher(*RDA); if (LoLoop.IsTailPredicationLegal()) { - SmallVector Killed; - SmallVector Dead; - if (auto *Def = RDA->getReachingMIDef(LoLoop.Start, - LoLoop.Start->getOperand(0).getReg())) { + if (auto *Def = Matcher.MIOperand(LoLoop.Start, 0)) { SmallPtrSet Remove; SmallPtrSet Ignore = { LoLoop.Start, LoLoop.Dec, LoLoop.End, LoLoop.InsertPt }; @@ -839,7 +948,7 @@ for (auto &MO : MI->operands()) { if (!MO.isReg() || !MO.isUse() || MO.getReg() == 0) continue; - if (auto *Op = RDA->getReachingMIDef(MI, MO.getReg())) + if (auto *Op = Matcher.MIOperand(MI, MO)) Chain.push_back(Op); } Ignore.insert(MI); @@ -1005,7 +1114,7 @@ } } - PostOrderLoopTraversal DFS(*LoLoop.ML, *MLI); + PostOrderLoopTraversal DFS(LoLoop.ML, *MLI); DFS.ProcessLoop(); const SmallVectorImpl &PostOrder = DFS.getOrder(); for (auto *MBB : PostOrder) { @@ -1017,6 +1126,8 @@ for (auto *MBB : reverse(PostOrder)) recomputeLivenessFlags(*MBB); + + RDA->reset(); } bool ARMLowOverheadLoops::RevertNonLoops() { Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/extract-element.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/extract-element.mir @@ -0,0 +1,186 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - --verify-machineinstrs | FileCheck %s + +# Test that the scalar register that aliases a Q reg prevents the tail +# predication. 
+ +--- | + define dso_local i32 @no_vpsel_liveout(i16* nocapture readonly %a, i16* nocapture readonly %b, i32 %N) local_unnamed_addr #0 { + entry: + %cmp9 = icmp eq i32 %N, 0 + %tmp = add i32 %N, 3 + %tmp1 = lshr i32 %tmp, 2 + %tmp2 = shl nuw i32 %tmp1, 2 + %tmp3 = add i32 %tmp2, -4 + %tmp4 = lshr i32 %tmp3, 2 + %tmp5 = add nuw nsw i32 %tmp4, 1 + br i1 %cmp9, label %for.cond.cleanup, label %vector.ph + + vector.ph: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %tmp5) + br label %vector.body + + vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ] + %lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %vector.ph ] + %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ] + %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %tmp13, %vector.body ] + %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ] + %lsr.iv17 = bitcast i16* %lsr.iv to <4 x i16>* + %lsr.iv1820 = bitcast i16* %lsr.iv18 to <4 x i16>* + %tmp8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %tmp7) + %tmp9 = sub i32 %tmp7, 4 + %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef) + %tmp10 = sext <4 x i16> %wide.masked.load to <4 x i32> + %wide.masked.load14 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv1820, i32 2, <4 x i1> %tmp8, <4 x i16> undef) + %tmp11 = sext <4 x i16> %wide.masked.load14 to <4 x i32> + %tmp12 = mul nsw <4 x i32> %tmp11, %tmp10 + %tmp13 = add <4 x i32> %tmp12, %vec.phi + %scevgep = getelementptr i16, i16* %lsr.iv, i32 4 + %scevgep19 = getelementptr i16, i16* %lsr.iv18, i32 4 + %tmp14 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv1, i32 1) + %tmp15 = icmp ne i32 %tmp14, 0 + %lsr.iv.next = add nsw i32 %lsr.iv1, -1 + br i1 %tmp15, label %vector.body, label %middle.block + + middle.block: ; preds = %vector.body + %tmp16 = extractelement <4 x i32> %tmp13, i32 3 + br label 
%for.cond.cleanup + + for.cond.cleanup: ; preds = %middle.block, %entry + %res.0.lcssa = phi i32 [ 0, %entry ], [ %tmp16, %middle.block ] + ret i32 %res.0.lcssa + } + declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) + declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) + declare <4 x i1> @llvm.arm.mve.vctp32(i32) + +... +--- +name: no_vpsel_liveout +alignment: 2 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: no_vpsel_liveout + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r7 + ; CHECK: tCMPi8 renamable $r2, 0, 14, $noreg, implicit-def $cpsr + ; CHECK: t2IT 0, 4, implicit-def $itstate + 
; CHECK: renamable $r0 = tMOVi8 $noreg, 0, 0, $cpsr, implicit killed $r0, implicit $itstate + ; CHECK: tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate + ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14, $noreg + ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14, $noreg, $noreg + ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14, $noreg, $noreg + ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg + ; CHECK: renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14, $noreg, $noreg + ; CHECK: dead $lr = t2DLS renamable $r12 + ; CHECK: $r3 = tMOVr killed $r12, 14, $noreg + ; CHECK: bb.1.vector.body: + ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: liveins: $q0, $r0, $r1, $r2, $r3 + ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg + ; CHECK: MVE_VPST 4, implicit $vpr + ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17, align 2) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, killed renamable $vpr :: (load 8 from %ir.lsr.iv1820, align 2) + ; CHECK: $lr = tMOVr $r3, 14, $noreg + ; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 + ; CHECK: renamable $r3, dead $cpsr = nsw tSUBi8 killed $r3, 1, 14, $noreg + ; CHECK: renamable $q0 = MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 + ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.1 + ; CHECK: bb.2.middle.block: + ; CHECK: liveins: $q0 + ; CHECK: 
$r0 = VMOVRS killed $s3, 14, $noreg, implicit killed $q0 + ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc, implicit killed $r0 + bb.0.entry: + successors: %bb.1(0x80000000) + liveins: $r0, $r1, $r2, $lr, $r7 + + tCMPi8 renamable $r2, 0, 14, $noreg, implicit-def $cpsr + t2IT 0, 4, implicit-def $itstate + renamable $r0 = tMOVi8 $noreg, 0, 0, $cpsr, implicit killed $r0, implicit $itstate + tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate + frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14, $noreg + renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + renamable $r3 = t2BICri killed renamable $r3, 3, 14, $noreg, $noreg + renamable $r12 = t2SUBri killed renamable $r3, 4, 14, $noreg, $noreg + renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg + renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14, $noreg, $noreg + t2DoLoopStart renamable $r12 + $r3 = tMOVr killed $r12, 14, $noreg + + bb.1.vector.body: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $q0, $r0, $r1, $r2, $r3 + + renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg + MVE_VPST 4, implicit $vpr + renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17, align 2) + renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, killed renamable $vpr :: (load 8 from %ir.lsr.iv1820, align 2) + $lr = tMOVr $r3, 14, $noreg + renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 + renamable $r3, dead $cpsr = nsw tSUBi8 killed $r3, 1, 14, $noreg + renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14, $noreg + renamable $q0 = MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, 
$noreg, undef renamable $q0 + renamable $lr = t2LoopDec killed renamable $lr, 1 + t2LoopEnd killed renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.2, 14, $noreg + + bb.2.middle.block: + liveins: $q0 + + $r0 = VMOVRS killed $s3, 14, $noreg, implicit $q0 + tPOP_RET 14, $noreg, def $r7, def $pc, implicit killed $r0 + +... Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-liveout-vals.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-liveout-vals.mir @@ -0,0 +1,772 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - --verify-machineinstrs | FileCheck %s +--- | + define dso_local arm_aapcs_vfpcc zeroext i8 @correct_vctp8_reduce(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %N) { + entry: + %cmp11 = icmp eq i32 %N, 0 + %0 = add i32 %N, 15 + %1 = lshr i32 %0, 4 + %2 = shl nuw i32 %1, 4 + %3 = add i32 %2, -16 + %4 = lshr i32 %3, 4 + %5 = add nuw nsw i32 %4, 1 + br i1 %cmp11, label %for.cond.cleanup, label %vector.ph + + vector.ph: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %5) + %6 = shl i32 %4, 4 + %7 = sub i32 %N, %6 + br label %vector.body + + vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv20 = phi i8* [ %scevgep21, %vector.body ], [ %b, %vector.ph ] + %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %vector.ph ] + %vec.phi = phi <16 x i8> [ zeroinitializer, %vector.ph ], [ %13, %vector.body ] + %8 = phi i32 [ %5, %vector.ph ], [ %14, %vector.body ] + %9 = phi i32 [ %N, %vector.ph ], [ %11, %vector.body ] + %lsr.iv2022 = bitcast i8* %lsr.iv20 to <16 x i8>* + %lsr.iv19 = bitcast i8* %lsr.iv to <16 x i8>* + %10 = call <16 x i1> @llvm.arm.mve.vctp8(i32 %9) + %11 = sub i32 %9, 16 + %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %lsr.iv19, i32 1, <16 x i1> %10, <16 x i8> undef) + 
%wide.masked.load16 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %lsr.iv2022, i32 1, <16 x i1> %10, <16 x i8> undef) + %12 = add <16 x i8> %wide.masked.load, %vec.phi + %13 = add <16 x i8> %12, %wide.masked.load16 + %scevgep = getelementptr i8, i8* %lsr.iv, i32 16 + %scevgep21 = getelementptr i8, i8* %lsr.iv20, i32 16 + %14 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %8, i32 1) + %15 = icmp ne i32 %14, 0 + br i1 %15, label %vector.body, label %middle.block + + middle.block: ; preds = %vector.body + %vec.phi.lcssa = phi <16 x i8> [ %vec.phi, %vector.body ] + %.lcssa = phi <16 x i8> [ %13, %vector.body ] + %16 = call <16 x i1> @llvm.arm.mve.vctp8(i32 %7) + %17 = select <16 x i1> %16, <16 x i8> %.lcssa, <16 x i8> %vec.phi.lcssa + %18 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %17) + br label %for.cond.cleanup + + for.cond.cleanup: ; preds = %middle.block, %entry + %res.0.lcssa = phi i8 [ 0, %entry ], [ %18, %middle.block ] + ret i8 %res.0.lcssa + } + + ; Function Attrs: norecurse nounwind readonly + define dso_local arm_aapcs_vfpcc zeroext i8 @incorrect_vctp8_reduce(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %N) local_unnamed_addr #0 { + entry: + %cmp11 = icmp eq i32 %N, 0 + %0 = add i32 %N, 15 + %1 = lshr i32 %0, 4 + %2 = shl nuw i32 %1, 4 + %3 = add i32 %2, -16 + %4 = lshr i32 %3, 3 + %5 = add nuw nsw i32 %4, 1 + br i1 %cmp11, label %for.cond.cleanup, label %vector.ph + + vector.ph: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %5) + %6 = shl i32 %4, 4 + %7 = sub i32 %N, %6 + br label %vector.body + + vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv20 = phi i8* [ %scevgep21, %vector.body ], [ %b, %vector.ph ] + %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %vector.ph ] + %vec.phi = phi <16 x i8> [ zeroinitializer, %vector.ph ], [ %13, %vector.body ] + %8 = phi i32 [ %5, %vector.ph ], [ %14, %vector.body ] + %9 = phi i32 [ %N, %vector.ph ], [ %11, %vector.body ] + %lsr.iv2022 = 
bitcast i8* %lsr.iv20 to <16 x i8>* + %lsr.iv19 = bitcast i8* %lsr.iv to <16 x i8>* + %10 = call <16 x i1> @llvm.arm.mve.vctp8(i32 %9) + %11 = sub i32 %9, 16 + %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %lsr.iv19, i32 1, <16 x i1> %10, <16 x i8> undef) + %wide.masked.load16 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %lsr.iv2022, i32 1, <16 x i1> %10, <16 x i8> undef) + %12 = add <16 x i8> %wide.masked.load, %vec.phi + %13 = add <16 x i8> %12, %wide.masked.load16 + %scevgep = getelementptr i8, i8* %lsr.iv, i32 16 + %scevgep21 = getelementptr i8, i8* %lsr.iv20, i32 16 + %14 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %8, i32 1) + %15 = icmp ne i32 %14, 0 + br i1 %15, label %vector.body, label %middle.block + + middle.block: ; preds = %vector.body + %vec.phi.lcssa = phi <16 x i8> [ %vec.phi, %vector.body ] + %.lcssa = phi <16 x i8> [ %13, %vector.body ] + %16 = call <16 x i1> @llvm.arm.mve.vctp8(i32 %7) + %17 = select <16 x i1> %16, <16 x i8> %.lcssa, <16 x i8> %vec.phi.lcssa + %18 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %17) + br label %for.cond.cleanup + + for.cond.cleanup: ; preds = %middle.block, %entry + %res.0.lcssa = phi i8 [ 0, %entry ], [ %18, %middle.block ] + ret i8 %res.0.lcssa + } + + ; Function Attrs: norecurse nounwind readonly + define dso_local arm_aapcs_vfpcc zeroext i16 @correct_vctp16_reduce(i16* nocapture readonly %a, i16* nocapture readonly %b, i32 %N) local_unnamed_addr #0 { + entry: + %cmp12 = icmp eq i32 %N, 0 + %0 = add i32 %N, 7 + %1 = lshr i32 %0, 3 + %2 = shl nuw i32 %1, 3 + %3 = add i32 %2, -8 + %4 = lshr i32 %3, 3 + %5 = add nuw nsw i32 %4, 1 + br i1 %cmp12, label %for.cond.cleanup, label %vector.ph + + vector.ph: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %5) + %6 = shl i32 %4, 3 + %7 = sub i32 %N, %6 + br label %vector.body + + vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv21 = phi i16* [ %scevgep22, %vector.body ], [ %b, 
%vector.ph ] + %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ] + %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %13, %vector.body ] + %8 = phi i32 [ %5, %vector.ph ], [ %14, %vector.body ] + %9 = phi i32 [ %N, %vector.ph ], [ %11, %vector.body ] + %lsr.iv2123 = bitcast i16* %lsr.iv21 to <8 x i16>* + %lsr.iv20 = bitcast i16* %lsr.iv to <8 x i16>* + %10 = call <8 x i1> @llvm.arm.mve.vctp16(i32 %9) + %11 = sub i32 %9, 8 + %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %lsr.iv20, i32 2, <8 x i1> %10, <8 x i16> undef) + %wide.masked.load17 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %lsr.iv2123, i32 2, <8 x i1> %10, <8 x i16> undef) + %12 = add <8 x i16> %wide.masked.load, %vec.phi + %13 = add <8 x i16> %12, %wide.masked.load17 + %scevgep = getelementptr i16, i16* %lsr.iv, i32 8 + %scevgep22 = getelementptr i16, i16* %lsr.iv21, i32 8 + %14 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %8, i32 1) + %15 = icmp ne i32 %14, 0 + br i1 %15, label %vector.body, label %middle.block + + middle.block: ; preds = %vector.body + %vec.phi.lcssa = phi <8 x i16> [ %vec.phi, %vector.body ] + %.lcssa = phi <8 x i16> [ %13, %vector.body ] + %16 = call <8 x i1> @llvm.arm.mve.vctp16(i32 %7) + %17 = select <8 x i1> %16, <8 x i16> %.lcssa, <8 x i16> %vec.phi.lcssa + %18 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %17) + br label %for.cond.cleanup + + for.cond.cleanup: ; preds = %middle.block, %entry + %res.0.lcssa = phi i16 [ 0, %entry ], [ %18, %middle.block ] + ret i16 %res.0.lcssa + } + + ; Function Attrs: norecurse nounwind readonly + define dso_local arm_aapcs_vfpcc zeroext i16 @incorrect_vctp16_reduce(i16* nocapture readonly %a, i16* nocapture readonly %b, i32 %N) local_unnamed_addr #0 { + entry: + %cmp12 = icmp eq i32 %N, 0 + %0 = add i32 %N, 7 + %1 = lshr i32 %0, 3 + %2 = shl nuw i32 %1, 3 + %3 = add i32 %2, -8 + %4 = lshr i32 %3, 2 + %5 = add nuw nsw i32 %4, 1 + br i1 %cmp12, label 
%for.cond.cleanup, label %vector.ph + + vector.ph: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %5) + %6 = shl i32 %4, 3 + %7 = sub i32 %N, %6 + br label %vector.body + + vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv21 = phi i16* [ %scevgep22, %vector.body ], [ %b, %vector.ph ] + %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ] + %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %13, %vector.body ] + %8 = phi i32 [ %5, %vector.ph ], [ %14, %vector.body ] + %9 = phi i32 [ %N, %vector.ph ], [ %11, %vector.body ] + %lsr.iv2123 = bitcast i16* %lsr.iv21 to <8 x i16>* + %lsr.iv20 = bitcast i16* %lsr.iv to <8 x i16>* + %10 = call <8 x i1> @llvm.arm.mve.vctp16(i32 %9) + %11 = sub i32 %9, 8 + %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %lsr.iv20, i32 2, <8 x i1> %10, <8 x i16> undef) + %wide.masked.load17 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %lsr.iv2123, i32 2, <8 x i1> %10, <8 x i16> undef) + %12 = add <8 x i16> %wide.masked.load, %vec.phi + %13 = add <8 x i16> %12, %wide.masked.load17 + %scevgep = getelementptr i16, i16* %lsr.iv, i32 8 + %scevgep22 = getelementptr i16, i16* %lsr.iv21, i32 8 + %14 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %8, i32 1) + %15 = icmp ne i32 %14, 0 + br i1 %15, label %vector.body, label %middle.block + + middle.block: ; preds = %vector.body + %vec.phi.lcssa = phi <8 x i16> [ %vec.phi, %vector.body ] + %.lcssa = phi <8 x i16> [ %13, %vector.body ] + %16 = call <8 x i1> @llvm.arm.mve.vctp16(i32 %7) + %17 = select <8 x i1> %16, <8 x i16> %.lcssa, <8 x i16> %vec.phi.lcssa + %18 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %17) + br label %for.cond.cleanup + + for.cond.cleanup: ; preds = %middle.block, %entry + %res.0.lcssa = phi i16 [ 0, %entry ], [ %18, %middle.block ] + ret i16 %res.0.lcssa + } + + declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) #1 + declare 
i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>) #2 + declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>) #1 + declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>) #2 + declare void @llvm.set.loop.iterations.i32(i32) #3 + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 + declare <16 x i1> @llvm.arm.mve.vctp8(i32) #4 + declare <8 x i1> @llvm.arm.mve.vctp16(i32) #4 + +... +--- +name: correct_vctp8_reduce +alignment: 2 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: correct_vctp8_reduce + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2 + ; CHECK: tCMPi8 renamable $r2, 0, 14, $noreg, implicit-def $cpsr + ; CHECK: t2IT 0, 
2, implicit-def $itstate + ; CHECK: renamable $r0 = tMOVi8 $noreg, 0, 0, $cpsr, implicit killed $r0, implicit $itstate + ; CHECK: renamable $r0 = tUXTB killed renamable $r0, 0, $cpsr, implicit killed $r0, implicit $itstate + ; CHECK: tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate + ; CHECK: frame-setup tPUSH 14, $noreg, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: dead $r7 = frame-setup tMOVr $sp, 14, $noreg + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7 + ; CHECK: renamable $r3 = t2ADDri renamable $r2, 15, 14, $noreg, $noreg + ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 15, 14, $noreg, $noreg + ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 16, 14, $noreg, $noreg + ; CHECK: renamable $r3 = t2LSRri killed renamable $r12, 4, 14, $noreg, $noreg + ; CHECK: renamable $r3 = t2SUBrs renamable $r2, killed renamable $r3, 34, 14, $noreg, $noreg + ; CHECK: $lr = MVE_DLSTP_8 killed renamable $r2 + ; CHECK: bb.1.vector.body: + ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: liveins: $lr, $q0, $r0, $r1, $r3 + ; CHECK: $q1 = MVE_VORR killed $q0, killed $q0, 0, $noreg, undef $q1 + ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRBU8_post killed renamable $r1, 16, 0, $noreg :: (load 16 from %ir.lsr.iv2022, align 1) + ; CHECK: renamable $r0, renamable $q2 = MVE_VLDRBU8_post killed renamable $r0, 16, 0, killed $noreg :: (load 16 from %ir.lsr.iv19, align 1) + ; CHECK: renamable $q2 = MVE_VADDi8 killed renamable $q2, renamable $q1, 0, $noreg, undef renamable $q2 + ; CHECK: renamable $q0 = MVE_VADDi8 killed renamable $q2, killed renamable $q0, 0, $noreg, undef renamable $q0 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 + ; CHECK: bb.2.middle.block: + ; CHECK: 
liveins: $q0, $q1, $r3 + ; CHECK: renamable $vpr = MVE_VCTP8 killed renamable $r3, 0, $noreg + ; CHECK: renamable $q0 = MVE_VPSEL killed renamable $q0, killed renamable $q1, 0, killed renamable $vpr + ; CHECK: renamable $r0 = MVE_VADDVu8no_acc killed renamable $q0, 0, $noreg + ; CHECK: $sp = t2LDMIA_UPD $sp, 14, $noreg, def $r7, def $lr + ; CHECK: renamable $r0 = tUXTB killed renamable $r0, 14, $noreg + ; CHECK: tBX_RET 14, $noreg, implicit killed $r0 + bb.0.entry: + successors: %bb.1(0x80000000) + liveins: $r0, $r1, $r2, $lr + + tCMPi8 renamable $r2, 0, 14, $noreg, implicit-def $cpsr + t2IT 0, 2, implicit-def $itstate + renamable $r0 = tMOVi8 $noreg, 0, 0, $cpsr, implicit killed $r0, implicit $itstate + renamable $r0 = tUXTB killed renamable $r0, 0, $cpsr, implicit killed $r0, implicit $itstate + tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate + frame-setup tPUSH 14, $noreg, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + $r7 = frame-setup tMOVr $sp, 14, $noreg + frame-setup CFI_INSTRUCTION def_cfa_register $r7 + renamable $r3 = t2ADDri renamable $r2, 15, 14, $noreg, $noreg + renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + renamable $r3 = t2BICri killed renamable $r3, 15, 14, $noreg, $noreg + renamable $r12 = t2SUBri killed renamable $r3, 16, 14, $noreg, $noreg + renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg + renamable $lr = nuw nsw t2ADDrs killed renamable $r3, renamable $r12, 35, 14, $noreg, $noreg + renamable $r3 = t2LSRri killed renamable $r12, 4, 14, $noreg, $noreg + renamable $r3 = t2SUBrs renamable $r2, killed renamable $r3, 34, 14, $noreg, $noreg + t2DoLoopStart renamable $lr + + bb.1.vector.body: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $lr, $q0, $r0, $r1, $r2, $r3 + + renamable $vpr = MVE_VCTP8 renamable $r2, 0, $noreg + $q1 = MVE_VORR killed $q0, $q0, 0, $noreg, 
undef $q1 + MVE_VPST 4, implicit $vpr + renamable $r1, renamable $q0 = MVE_VLDRBU8_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv2022, align 1) + renamable $r0, renamable $q2 = MVE_VLDRBU8_post killed renamable $r0, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv19, align 1) + renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 16, 14, $noreg + renamable $q2 = MVE_VADDi8 killed renamable $q2, renamable $q1, 0, $noreg, undef renamable $q2 + renamable $lr = t2LoopDec killed renamable $lr, 1 + renamable $q0 = MVE_VADDi8 killed renamable $q2, killed renamable $q0, 0, $noreg, undef renamable $q0 + t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.2, 14, $noreg + + bb.2.middle.block: + liveins: $q0, $q1, $r3 + + renamable $vpr = MVE_VCTP8 killed renamable $r3, 0, $noreg + renamable $q0 = MVE_VPSEL killed renamable $q0, killed renamable $q1, 0, killed renamable $vpr + renamable $r0 = MVE_VADDVu8no_acc killed renamable $q0, 0, $noreg + $sp = t2LDMIA_UPD $sp, 14, $noreg, def $r7, def $lr + renamable $r0 = tUXTB killed renamable $r0, 14, $noreg + tBX_RET 14, $noreg, implicit killed $r0 + +... 
+--- +name: incorrect_vctp8_reduce +alignment: 2 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: incorrect_vctp8_reduce + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2 + ; CHECK: tCMPi8 renamable $r2, 0, 14, $noreg, implicit-def $cpsr + ; CHECK: t2IT 0, 2, implicit-def $itstate + ; CHECK: renamable $r0 = tMOVi8 $noreg, 0, 0, $cpsr, implicit killed $r0, implicit $itstate + ; CHECK: renamable $r0 = tUXTB killed renamable $r0, 0, $cpsr, implicit killed $r0, implicit $itstate + ; CHECK: tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate + ; CHECK: frame-setup tPUSH 14, $noreg, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; 
CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: dead $r7 = frame-setup tMOVr $sp, 14, $noreg + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7 + ; CHECK: renamable $r3 = t2ADDri renamable $r2, 15, 14, $noreg, $noreg + ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 15, 14, $noreg, $noreg + ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 16, 14, $noreg, $noreg + ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg + ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r3, renamable $r12, 35, 14, $noreg, $noreg + ; CHECK: renamable $r3 = t2LSRri killed renamable $r12, 3, 14, $noreg, $noreg + ; CHECK: renamable $r3 = t2SUBrs renamable $r2, killed renamable $r3, 34, 14, $noreg, $noreg + ; CHECK: $lr = t2DLS killed renamable $lr + ; CHECK: bb.1.vector.body: + ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: liveins: $lr, $q0, $r0, $r1, $r2, $r3 + ; CHECK: renamable $vpr = MVE_VCTP8 renamable $r2, 0, $noreg + ; CHECK: $q1 = MVE_VORR killed $q0, killed $q0, 0, $noreg, undef $q1 + ; CHECK: MVE_VPST 4, implicit $vpr + ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRBU8_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv2022, align 1) + ; CHECK: renamable $r0, renamable $q2 = MVE_VLDRBU8_post killed renamable $r0, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv19, align 1) + ; CHECK: renamable $q2 = MVE_VADDi8 killed renamable $q2, renamable $q1, 0, $noreg, undef renamable $q2 + ; CHECK: renamable $q0 = MVE_VADDi8 killed renamable $q2, killed renamable $q0, 0, $noreg, undef renamable $q0 + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1 + ; CHECK: bb.2.middle.block: + ; CHECK: liveins: $q0, $q1, $r3 + ; CHECK: renamable $vpr = MVE_VCTP8 killed renamable $r3, 0, $noreg + ; CHECK: renamable $q0 = MVE_VPSEL killed renamable $q0, killed 
renamable $q1, 0, killed renamable $vpr + ; CHECK: renamable $r0 = MVE_VADDVu8no_acc killed renamable $q0, 0, $noreg + ; CHECK: $sp = t2LDMIA_UPD $sp, 14, $noreg, def $r7, def $lr + ; CHECK: renamable $r0 = tUXTB killed renamable $r0, 14, $noreg + ; CHECK: tBX_RET 14, $noreg, implicit killed $r0 + bb.0.entry: + successors: %bb.1(0x80000000) + liveins: $r0, $r1, $r2, $lr + + tCMPi8 renamable $r2, 0, 14, $noreg, implicit-def $cpsr + t2IT 0, 2, implicit-def $itstate + renamable $r0 = tMOVi8 $noreg, 0, 0, $cpsr, implicit killed $r0, implicit $itstate + renamable $r0 = tUXTB killed renamable $r0, 0, $cpsr, implicit killed $r0, implicit $itstate + tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate + frame-setup tPUSH 14, $noreg, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + $r7 = frame-setup tMOVr $sp, 14, $noreg + frame-setup CFI_INSTRUCTION def_cfa_register $r7 + renamable $r3 = t2ADDri renamable $r2, 15, 14, $noreg, $noreg + renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + renamable $r3 = t2BICri killed renamable $r3, 15, 14, $noreg, $noreg + renamable $r12 = t2SUBri killed renamable $r3, 16, 14, $noreg, $noreg + renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg + renamable $lr = nuw nsw t2ADDrs killed renamable $r3, renamable $r12, 35, 14, $noreg, $noreg + renamable $r3 = t2LSRri killed renamable $r12, 3, 14, $noreg, $noreg + renamable $r3 = t2SUBrs renamable $r2, killed renamable $r3, 34, 14, $noreg, $noreg + t2DoLoopStart renamable $lr + + bb.1.vector.body: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $lr, $q0, $r0, $r1, $r2, $r3 + + renamable $vpr = MVE_VCTP8 renamable $r2, 0, $noreg + $q1 = MVE_VORR killed $q0, $q0, 0, $noreg, undef $q1 + MVE_VPST 4, implicit $vpr + renamable $r1, renamable $q0 = MVE_VLDRBU8_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from 
%ir.lsr.iv2022, align 1) + renamable $r0, renamable $q2 = MVE_VLDRBU8_post killed renamable $r0, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv19, align 1) + renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 16, 14, $noreg + renamable $q2 = MVE_VADDi8 killed renamable $q2, renamable $q1, 0, $noreg, undef renamable $q2 + renamable $lr = t2LoopDec killed renamable $lr, 1 + renamable $q0 = MVE_VADDi8 killed renamable $q2, killed renamable $q0, 0, $noreg, undef renamable $q0 + t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.2, 14, $noreg + + bb.2.middle.block: + liveins: $q0, $q1, $r3 + + renamable $vpr = MVE_VCTP8 killed renamable $r3, 0, $noreg + renamable $q0 = MVE_VPSEL killed renamable $q0, killed renamable $q1, 0, killed renamable $vpr + renamable $r0 = MVE_VADDVu8no_acc killed renamable $q0, 0, $noreg + $sp = t2LDMIA_UPD $sp, 14, $noreg, def $r7, def $lr + renamable $r0 = tUXTB killed renamable $r0, 14, $noreg + tBX_RET 14, $noreg, implicit killed $r0 + +... 
+--- +name: correct_vctp16_reduce +alignment: 2 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: correct_vctp16_reduce + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2 + ; CHECK: tCMPi8 renamable $r2, 0, 14, $noreg, implicit-def $cpsr + ; CHECK: t2IT 0, 2, implicit-def $itstate + ; CHECK: renamable $r0 = tMOVi8 $noreg, 0, 0, $cpsr, implicit killed $r0, implicit $itstate + ; CHECK: renamable $r0 = tUXTH killed renamable $r0, 0, $cpsr, implicit killed $r0, implicit $itstate + ; CHECK: tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate + ; CHECK: frame-setup tPUSH 14, $noreg, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: 
frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: dead $r7 = frame-setup tMOVr $sp, 14, $noreg + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7 + ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 7, 14, $noreg + ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 7, 14, $noreg, $noreg + ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 8, 14, $noreg, $noreg + ; CHECK: renamable $r3 = t2LSRri killed renamable $r12, 3, 14, $noreg, $noreg + ; CHECK: renamable $r3 = t2SUBrs renamable $r2, killed renamable $r3, 26, 14, $noreg, $noreg + ; CHECK: $lr = MVE_DLSTP_16 killed renamable $r2 + ; CHECK: bb.1.vector.body: + ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: liveins: $lr, $q0, $r0, $r1, $r3 + ; CHECK: $q1 = MVE_VORR killed $q0, killed $q0, 0, $noreg, undef $q1 + ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRHU16_post killed renamable $r1, 16, 0, $noreg :: (load 16 from %ir.lsr.iv2123, align 2) + ; CHECK: renamable $r0, renamable $q2 = MVE_VLDRHU16_post killed renamable $r0, 16, 0, killed $noreg :: (load 16 from %ir.lsr.iv20, align 2) + ; CHECK: renamable $q2 = MVE_VADDi16 killed renamable $q2, renamable $q1, 0, $noreg, undef renamable $q2 + ; CHECK: renamable $q0 = MVE_VADDi16 killed renamable $q2, killed renamable $q0, 0, $noreg, undef renamable $q0 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 + ; CHECK: bb.2.middle.block: + ; CHECK: liveins: $q0, $q1, $r3 + ; CHECK: renamable $vpr = MVE_VCTP16 killed renamable $r3, 0, $noreg + ; CHECK: renamable $q0 = MVE_VPSEL killed renamable $q0, killed renamable $q1, 0, killed renamable $vpr + ; CHECK: renamable $r0 = MVE_VADDVu16no_acc killed renamable $q0, 0, $noreg + ; CHECK: $sp = t2LDMIA_UPD $sp, 14, $noreg, def $r7, def $lr + ; CHECK: renamable $r0 = tUXTH killed renamable $r0, 14, $noreg + ; CHECK: tBX_RET 14, $noreg, implicit 
killed $r0 + bb.0.entry: + successors: %bb.1(0x80000000) + liveins: $r0, $r1, $r2, $lr + + tCMPi8 renamable $r2, 0, 14, $noreg, implicit-def $cpsr + t2IT 0, 2, implicit-def $itstate + renamable $r0 = tMOVi8 $noreg, 0, 0, $cpsr, implicit killed $r0, implicit $itstate + renamable $r0 = tUXTH killed renamable $r0, 0, $cpsr, implicit killed $r0, implicit $itstate + tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate + frame-setup tPUSH 14, $noreg, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + $r7 = frame-setup tMOVr $sp, 14, $noreg + frame-setup CFI_INSTRUCTION def_cfa_register $r7 + renamable $r3, dead $cpsr = tADDi3 renamable $r2, 7, 14, $noreg + renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + renamable $r3 = t2BICri killed renamable $r3, 7, 14, $noreg, $noreg + renamable $r12 = t2SUBri killed renamable $r3, 8, 14, $noreg, $noreg + renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg + renamable $lr = nuw nsw t2ADDrs killed renamable $r3, renamable $r12, 27, 14, $noreg, $noreg + renamable $r3 = t2LSRri killed renamable $r12, 3, 14, $noreg, $noreg + renamable $r3 = t2SUBrs renamable $r2, killed renamable $r3, 26, 14, $noreg, $noreg + t2DoLoopStart renamable $lr + + bb.1.vector.body: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $lr, $q0, $r0, $r1, $r2, $r3 + + renamable $vpr = MVE_VCTP16 renamable $r2, 0, $noreg + $q1 = MVE_VORR killed $q0, $q0, 0, $noreg, undef $q1 + MVE_VPST 4, implicit $vpr + renamable $r1, renamable $q0 = MVE_VLDRHU16_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv2123, align 2) + renamable $r0, renamable $q2 = MVE_VLDRHU16_post killed renamable $r0, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv20, align 2) + renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 8, 14, $noreg + renamable $q2 = MVE_VADDi16 killed renamable $q2, 
renamable $q1, 0, $noreg, undef renamable $q2 + renamable $lr = t2LoopDec killed renamable $lr, 1 + renamable $q0 = MVE_VADDi16 killed renamable $q2, killed renamable $q0, 0, $noreg, undef renamable $q0 + t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.2, 14, $noreg + + bb.2.middle.block: + liveins: $q0, $q1, $r3 + + renamable $vpr = MVE_VCTP16 killed renamable $r3, 0, $noreg + renamable $q0 = MVE_VPSEL killed renamable $q0, killed renamable $q1, 0, killed renamable $vpr + renamable $r0 = MVE_VADDVu16no_acc killed renamable $q0, 0, $noreg + $sp = t2LDMIA_UPD $sp, 14, $noreg, def $r7, def $lr + renamable $r0 = tUXTH killed renamable $r0, 14, $noreg + tBX_RET 14, $noreg, implicit killed $r0 + +... +--- +name: incorrect_vctp16_reduce +alignment: 2 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] 
+constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: incorrect_vctp16_reduce + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2 + ; CHECK: tCMPi8 renamable $r2, 0, 14, $noreg, implicit-def $cpsr + ; CHECK: t2IT 0, 2, implicit-def $itstate + ; CHECK: renamable $r0 = tMOVi8 $noreg, 0, 0, $cpsr, implicit killed $r0, implicit $itstate + ; CHECK: renamable $r0 = tUXTH killed renamable $r0, 0, $cpsr, implicit killed $r0, implicit $itstate + ; CHECK: tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate + ; CHECK: frame-setup tPUSH 14, $noreg, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: dead $r7 = frame-setup tMOVr $sp, 14, $noreg + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7 + ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 7, 14, $noreg + ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 7, 14, $noreg, $noreg + ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 8, 14, $noreg, $noreg + ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg + ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r3, renamable $r12, 27, 14, $noreg, $noreg + ; CHECK: renamable $r3 = t2LSRri killed renamable $r12, 2, 14, $noreg, $noreg + ; CHECK: renamable $r3 = t2SUBrs renamable $r2, killed renamable $r3, 26, 14, $noreg, $noreg + ; CHECK: $lr = t2DLS killed renamable $lr + ; CHECK: bb.1.vector.body: + ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: liveins: $lr, $q0, $r0, $r1, $r2, $r3 + ; CHECK: renamable $vpr = MVE_VCTP16 renamable $r2, 0, $noreg + ; CHECK: $q1 = MVE_VORR killed $q0, killed $q0, 0, $noreg, undef $q1 + ; CHECK: MVE_VPST 4, implicit $vpr + ; CHECK: renamable $r1, renamable $q0 = 
MVE_VLDRHU16_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv2123, align 2) + ; CHECK: renamable $r0, renamable $q2 = MVE_VLDRHU16_post killed renamable $r0, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv20, align 2) + ; CHECK: renamable $q2 = MVE_VADDi16 killed renamable $q2, renamable $q1, 0, $noreg, undef renamable $q2 + ; CHECK: renamable $q0 = MVE_VADDi16 killed renamable $q2, killed renamable $q0, 0, $noreg, undef renamable $q0 + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1 + ; CHECK: bb.2.middle.block: + ; CHECK: liveins: $q0, $q1, $r3 + ; CHECK: renamable $vpr = MVE_VCTP16 killed renamable $r3, 0, $noreg + ; CHECK: renamable $q0 = MVE_VPSEL killed renamable $q0, killed renamable $q1, 0, killed renamable $vpr + ; CHECK: renamable $r0 = MVE_VADDVu16no_acc killed renamable $q0, 0, $noreg + ; CHECK: $sp = t2LDMIA_UPD $sp, 14, $noreg, def $r7, def $lr + ; CHECK: renamable $r0 = tUXTH killed renamable $r0, 14, $noreg + ; CHECK: tBX_RET 14, $noreg, implicit killed $r0 + bb.0.entry: + successors: %bb.1(0x80000000) + liveins: $r0, $r1, $r2, $lr + + tCMPi8 renamable $r2, 0, 14, $noreg, implicit-def $cpsr + t2IT 0, 2, implicit-def $itstate + renamable $r0 = tMOVi8 $noreg, 0, 0, $cpsr, implicit killed $r0, implicit $itstate + renamable $r0 = tUXTH killed renamable $r0, 0, $cpsr, implicit killed $r0, implicit $itstate + tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate + frame-setup tPUSH 14, $noreg, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + $r7 = frame-setup tMOVr $sp, 14, $noreg + frame-setup CFI_INSTRUCTION def_cfa_register $r7 + renamable $r3, dead $cpsr = tADDi3 renamable $r2, 7, 14, $noreg + renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + renamable $r3 = t2BICri killed renamable $r3, 7, 14, $noreg, $noreg + renamable $r12 = t2SUBri killed renamable 
$r3, 8, 14, $noreg, $noreg + renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg + renamable $lr = nuw nsw t2ADDrs killed renamable $r3, renamable $r12, 27, 14, $noreg, $noreg + renamable $r3 = t2LSRri killed renamable $r12, 2, 14, $noreg, $noreg + renamable $r3 = t2SUBrs renamable $r2, killed renamable $r3, 26, 14, $noreg, $noreg + t2DoLoopStart renamable $lr + + bb.1.vector.body: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $lr, $q0, $r0, $r1, $r2, $r3 + + renamable $vpr = MVE_VCTP16 renamable $r2, 0, $noreg + $q1 = MVE_VORR killed $q0, $q0, 0, $noreg, undef $q1 + MVE_VPST 4, implicit $vpr + renamable $r1, renamable $q0 = MVE_VLDRHU16_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv2123, align 2) + renamable $r0, renamable $q2 = MVE_VLDRHU16_post killed renamable $r0, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv20, align 2) + renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 8, 14, $noreg + renamable $q2 = MVE_VADDi16 killed renamable $q2, renamable $q1, 0, $noreg, undef renamable $q2 + renamable $lr = t2LoopDec killed renamable $lr, 1 + renamable $q0 = MVE_VADDi16 killed renamable $q2, killed renamable $q0, 0, $noreg, undef renamable $q0 + t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.2, 14, $noreg + + bb.2.middle.block: + liveins: $q0, $q1, $r3 + + renamable $vpr = MVE_VCTP16 killed renamable $r3, 0, $noreg + renamable $q0 = MVE_VPSEL killed renamable $q0, killed renamable $q1, 0, killed renamable $vpr + renamable $r0 = MVE_VADDVu16no_acc killed renamable $q0, 0, $noreg + $sp = t2LDMIA_UPD $sp, 14, $noreg, def $r7, def $lr + renamable $r0 = tUXTH killed renamable $r0, 14, $noreg + tBX_RET 14, $noreg, implicit killed $r0 + +... 
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix.mir =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix.mir +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - | FileCheck %s +# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - --verify-machineinstrs | FileCheck %s # A decent sized test to handle a matrix, with scalar and vector low-overhead loops. @@ -270,9 +270,7 @@ ; CHECK: renamable $r0 = t2BICri killed renamable $r0, 3, 14, $noreg, $noreg ; CHECK: renamable $r3 = t2LSLri $r10, 1, 14, $noreg, $noreg ; CHECK: renamable $r1, dead $cpsr = tSUBi3 killed renamable $r0, 4, 14, $noreg - ; CHECK: renamable $r0, dead $cpsr = tMOVi8 1, 14, $noreg ; CHECK: renamable $q0 = MVE_VDUP32 renamable $r7, 0, $noreg, undef renamable $q0 - ; CHECK: renamable $r0 = nuw nsw t2ADDrs killed renamable $r0, renamable $r1, 19, 14, $noreg, $noreg ; CHECK: renamable $r1, dead $cpsr = tLSRri killed renamable $r1, 2, 14, $noreg ; CHECK: renamable $r9 = t2SUBrs $r10, killed renamable $r1, 18, 14, $noreg, $noreg ; CHECK: bb.5.for.cond4.preheader.us: Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/multiple-do-loops.mir =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/multiple-do-loops.mir +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/multiple-do-loops.mir @@ -563,7 +563,6 @@ ; CHECK: early-clobber $sp = frame-setup t2STR_PRE killed $r8, $sp, -4, 14, $noreg ; CHECK: frame-setup CFI_INSTRUCTION offset $r8, -24 ; CHECK: renamable $r6, dead $cpsr = tMOVi8 0, 14, $noreg - ; CHECK: dead renamable $r12 = t2MOVi 1, 14, $noreg, $noreg ; CHECK: t2CMPrs killed renamable $r6, renamable $r3, 11, 14, $noreg, implicit-def $cpsr ; CHECK: tBcc %bb.3, 
0, killed $cpsr ; CHECK: bb.1.vector.ph: @@ -801,7 +800,6 @@ ; CHECK: successors: %bb.6(0x30000000), %bb.4(0x50000000) ; CHECK: liveins: $r0, $r1, $r2, $r3 ; CHECK: renamable $r6, dead $cpsr = tMOVi8 0, 14, $noreg - ; CHECK: dead renamable $r8 = t2MOVi 1, 14, $noreg, $noreg ; CHECK: t2CMPrs killed renamable $r6, renamable $r3, 11, 14, $noreg, implicit-def $cpsr ; CHECK: tBcc %bb.6, 0, killed $cpsr ; CHECK: bb.4.vector.ph66: Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-vpsel-liveout.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-vpsel-liveout.mir @@ -0,0 +1,183 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - | FileCheck %s +--- | + define dso_local i32 @no_vpsel_liveout(i16* nocapture readonly %a, i16* nocapture readonly %b, i32 %N) local_unnamed_addr #0 { + entry: + %cmp9 = icmp eq i32 %N, 0 + %tmp = add i32 %N, 3 + %tmp1 = lshr i32 %tmp, 2 + %tmp2 = shl nuw i32 %tmp1, 2 + %tmp3 = add i32 %tmp2, -4 + %tmp4 = lshr i32 %tmp3, 2 + %tmp5 = add nuw nsw i32 %tmp4, 1 + br i1 %cmp9, label %for.cond.cleanup, label %vector.ph + + vector.ph: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %tmp5) + br label %vector.body + + vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ] + %lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %vector.ph ] + %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ] + %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %tmp13, %vector.body ] + %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ] + %lsr.iv17 = bitcast i16* %lsr.iv to <4 x i16>* + %lsr.iv1820 = bitcast i16* %lsr.iv18 to <4 x i16>* + %tmp8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %tmp7) + %tmp9 = sub i32 %tmp7, 4 + %wide.masked.load = call <4 x i16> 
@llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef) + %tmp10 = sext <4 x i16> %wide.masked.load to <4 x i32> + %wide.masked.load14 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv1820, i32 2, <4 x i1> %tmp8, <4 x i16> undef) + %tmp11 = sext <4 x i16> %wide.masked.load14 to <4 x i32> + %tmp12 = mul nsw <4 x i32> %tmp11, %tmp10 + %tmp13 = add <4 x i32> %tmp12, %vec.phi + %scevgep = getelementptr i16, i16* %lsr.iv, i32 4 + %scevgep19 = getelementptr i16, i16* %lsr.iv18, i32 4 + %tmp14 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv1, i32 1) + %tmp15 = icmp ne i32 %tmp14, 0 + %lsr.iv.next = add nsw i32 %lsr.iv1, -1 + br i1 %tmp15, label %vector.body, label %middle.block + + middle.block: ; preds = %vector.body + %tmp16 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp13) + br label %for.cond.cleanup + + for.cond.cleanup: ; preds = %middle.block, %entry + %res.0.lcssa = phi i32 [ 0, %entry ], [ %tmp16, %middle.block ] + ret i32 %res.0.lcssa + } + declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1 + declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) #2 + declare void @llvm.set.loop.iterations.i32(i32) #3 + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 + declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4 + +... 
+--- +name: no_vpsel_liveout +alignment: 2 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: no_vpsel_liveout + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r7 + ; CHECK: tCMPi8 renamable $r2, 0, 14, $noreg, implicit-def $cpsr + ; CHECK: t2IT 0, 4, implicit-def $itstate + ; CHECK: renamable $r0 = tMOVi8 $noreg, 0, 0, $cpsr, implicit killed $r0, implicit $itstate + ; CHECK: tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate + ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, 
-8 + ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14, $noreg + ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14, $noreg, $noreg + ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14, $noreg, $noreg + ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg + ; CHECK: renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14, $noreg, $noreg + ; CHECK: dead $lr = t2DLS renamable $r12 + ; CHECK: $r3 = tMOVr killed $r12, 14, $noreg + ; CHECK: bb.1.vector.body: + ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: liveins: $q0, $r0, $r1, $r2, $r3 + ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg + ; CHECK: MVE_VPST 4, implicit $vpr + ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17, align 2) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, killed renamable $vpr :: (load 8 from %ir.lsr.iv1820, align 2) + ; CHECK: $lr = tMOVr $r3, 14, $noreg + ; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 + ; CHECK: renamable $r3, dead $cpsr = nsw tSUBi8 killed $r3, 1, 14, $noreg + ; CHECK: renamable $q0 = MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 + ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.1 + ; CHECK: bb.2.middle.block: + ; CHECK: liveins: $q0 + ; CHECK: renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg + ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc, implicit killed $r0 + bb.0.entry: + successors: %bb.1(0x80000000) + liveins: $r0, $r1, $r2, $lr, $r7 + + tCMPi8 renamable $r2, 0, 14, $noreg, implicit-def $cpsr + t2IT 0, 4, implicit-def $itstate + renamable $r0 = tMOVi8 $noreg, 0, 0, $cpsr, implicit killed $r0, implicit $itstate + tBX_RET 0, killed $cpsr, 
implicit $r0, implicit killed $itstate + frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14, $noreg + renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + renamable $r3 = t2BICri killed renamable $r3, 3, 14, $noreg, $noreg + renamable $r12 = t2SUBri killed renamable $r3, 4, 14, $noreg, $noreg + renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg + renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14, $noreg, $noreg + t2DoLoopStart renamable $r12 + $r3 = tMOVr killed $r12, 14, $noreg + + bb.1.vector.body: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $q0, $r0, $r1, $r2, $r3 + + renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg + MVE_VPST 4, implicit $vpr + renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17, align 2) + renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, killed renamable $vpr :: (load 8 from %ir.lsr.iv1820, align 2) + $lr = tMOVr $r3, 14, $noreg + renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 + renamable $r3, dead $cpsr = nsw tSUBi8 killed $r3, 1, 14, $noreg + renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14, $noreg + renamable $q0 = MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 + renamable $lr = t2LoopDec killed renamable $lr, 1 + t2LoopEnd killed renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.2, 14, $noreg + + bb.2.middle.block: + liveins: $q0 + + renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg + tPOP_RET 14, $noreg, def $r7, def $pc, implicit killed $r0 + +... 
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir @@ -0,0 +1,199 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - | FileCheck %s + +# This example is actually equivalent as there's a sub in the loop, which is +# then used by the add in the exit - making the vctp operands equivalent. + +--- | + define dso_local i32 @wrong_vctp_liveout(i16* nocapture readonly %a, i16* nocapture readonly %b, i32 %N) local_unnamed_addr #0 { + entry: + %cmp9 = icmp eq i32 %N, 0 + %0 = add i32 %N, 3 + %1 = lshr i32 %0, 2 + %2 = shl nuw i32 %1, 2 + %3 = add i32 %2, -4 + %4 = lshr i32 %3, 2 + %5 = add nuw nsw i32 %4, 1 + br i1 %cmp9, label %for.cond.cleanup, label %vector.ph + + vector.ph: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %5) + br label %vector.body + + vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %5, %vector.ph ] + %lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %vector.ph ] + %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ] + %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %12, %vector.body ] + %6 = phi i32 [ %N, %vector.ph ], [ %8, %vector.body ] + %lsr.iv17 = bitcast i16* %lsr.iv to <4 x i16>* + %lsr.iv1820 = bitcast i16* %lsr.iv18 to <4 x i16>* + %7 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %6) + %8 = sub i32 %6, 4 + %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %7, <4 x i16> undef) + %9 = sext <4 x i16> %wide.masked.load to <4 x i32> + %wide.masked.load14 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv1820, i32 2, <4 x i1> %7, <4 x i16> undef) + %10 = sext <4 x i16> 
%wide.masked.load14 to <4 x i32> + %11 = mul nsw <4 x i32> %10, %9 + %12 = add <4 x i32> %11, %vec.phi + %scevgep = getelementptr i16, i16* %lsr.iv, i32 4 + %scevgep19 = getelementptr i16, i16* %lsr.iv18, i32 4 + %13 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv1, i32 1) + %14 = icmp ne i32 %13, 0 + %lsr.iv.next = add nsw i32 %lsr.iv1, -1 + br i1 %14, label %vector.body, label %middle.block + + middle.block: ; preds = %vector.body + %15 = add i32 %8, 4 + %16 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %15) + %17 = select <4 x i1> %16, <4 x i32> %12, <4 x i32> %vec.phi + %18 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %17) + br label %for.cond.cleanup + + for.cond.cleanup: ; preds = %middle.block, %entry + %res.0.lcssa = phi i32 [ 0, %entry ], [ %18, %middle.block ] + ret i32 %res.0.lcssa + } + declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) + declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) + declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) + declare <4 x i1> @llvm.arm.mve.vctp32(i32) + +... 
+--- +name: wrong_vctp_liveout +alignment: 2 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: wrong_vctp_liveout + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r7 + ; CHECK: tCMPi8 renamable $r2, 0, 14, $noreg, implicit-def $cpsr + ; CHECK: t2IT 0, 4, implicit-def $itstate + ; CHECK: renamable $r0 = tMOVi8 $noreg, 0, 0, $cpsr, implicit killed $r0, implicit $itstate + ; CHECK: tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate + ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset 
$r7, -8 + ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14, $noreg + ; CHECK: renamable $q1 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q1 + ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14, $noreg, $noreg + ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14, $noreg, $noreg + ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg + ; CHECK: renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14, $noreg, $noreg + ; CHECK: dead $lr = t2DLS renamable $r12 + ; CHECK: $r3 = tMOVr killed $r12, 14, $noreg + ; CHECK: bb.1.vector.body: + ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: liveins: $q1, $r0, $r1, $r2, $r3 + ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg + ; CHECK: $q0 = MVE_VORR killed $q1, killed $q1, 0, $noreg, undef $q0 + ; CHECK: MVE_VPST 4, implicit $vpr + ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17, align 2) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, killed renamable $vpr :: (load 8 from %ir.lsr.iv1820, align 2) + ; CHECK: $lr = tMOVr $r3, 14, $noreg + ; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 + ; CHECK: renamable $r3, dead $cpsr = nsw tSUBi8 killed $r3, 1, 14, $noreg + ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14, $noreg + ; CHECK: renamable $q1 = MVE_VADDi32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1 + ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.1 + ; CHECK: bb.2.middle.block: + ; CHECK: liveins: $q0, $q1, $r2 + ; CHECK: renamable $r0, dead $cpsr = tADDi3 killed renamable $r2, 4, 14, $noreg + ; CHECK: renamable $vpr = MVE_VCTP32 killed renamable $r0, 0, $noreg + ; CHECK: renamable $q0 = MVE_VPSEL killed renamable $q1, killed renamable $q0, 0, killed renamable $vpr + ; CHECK: 
renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg + ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc, implicit killed $r0 + bb.0.entry: + successors: %bb.1(0x80000000) + liveins: $r0, $r1, $r2, $lr, $r7 + + tCMPi8 renamable $r2, 0, 14, $noreg, implicit-def $cpsr + t2IT 0, 4, implicit-def $itstate + renamable $r0 = tMOVi8 $noreg, 0, 0, $cpsr, implicit killed $r0, implicit $itstate + tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate + frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14, $noreg + renamable $q1 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q1 + renamable $r3 = t2BICri killed renamable $r3, 3, 14, $noreg, $noreg + renamable $r12 = t2SUBri killed renamable $r3, 4, 14, $noreg, $noreg + renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg + renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14, $noreg, $noreg + t2DoLoopStart renamable $r12 + $r3 = tMOVr killed $r12, 14, $noreg + + bb.1.vector.body: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $q1, $r0, $r1, $r2, $r3 + + renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg + $q0 = MVE_VORR killed $q1, $q1, 0, $noreg, undef $q0 + MVE_VPST 4, implicit $vpr + renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17, align 2) + renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, killed renamable $vpr :: (load 8 from %ir.lsr.iv1820, align 2) + $lr = tMOVr $r3, 14, $noreg + renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 + renamable $r3, dead $cpsr = nsw tSUBi8 killed $r3, 1, 14, $noreg + renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14, $noreg + renamable 
$q1 = MVE_VADDi32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1 + renamable $lr = t2LoopDec killed renamable $lr, 1 + t2LoopEnd killed renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.2, 14, $noreg + + bb.2.middle.block: + liveins: $q0, $q1, $r2 + + renamable $r0, dead $cpsr = tADDi3 killed renamable $r2, 4, 14, $noreg + renamable $vpr = MVE_VCTP32 killed renamable $r0, 0, $noreg + renamable $q0 = MVE_VPSEL killed renamable $q1, killed renamable $q0, 0, killed renamable $vpr + renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg + tPOP_RET 14, $noreg, def $r7, def $pc, implicit killed $r0 + +... Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir @@ -0,0 +1,210 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - | FileCheck %s + +# I think this should be equivalent, but the calculation in the middle block +# is too complex to process for now. 
+ +--- | + define dso_local i32 @wrong_vctp_liveout(i16* nocapture readonly %a, i16* nocapture readonly %b, i32 %N) local_unnamed_addr #0 { + entry: + %cmp9 = icmp eq i32 %N, 0 + %tmp = add i32 %N, 3 + %tmp1 = lshr i32 %tmp, 2 + %tmp2 = shl nuw i32 %tmp1, 2 + %tmp3 = add i32 %tmp2, -4 + %tmp4 = lshr i32 %tmp3, 2 + %tmp5 = add nuw nsw i32 %tmp4, 1 + br i1 %cmp9, label %for.cond.cleanup, label %vector.ph + + vector.ph: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %tmp5) + br label %vector.body + + vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ] + %lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %vector.ph ] + %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ] + %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %tmp13, %vector.body ] + %tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ] + %lsr.iv17 = bitcast i16* %lsr.iv to <4 x i16>* + %lsr.iv1820 = bitcast i16* %lsr.iv18 to <4 x i16>* + %tmp8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %tmp7) + %tmp9 = sub i32 %tmp7, 4 + %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef) + %tmp10 = sext <4 x i16> %wide.masked.load to <4 x i32> + %wide.masked.load14 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv1820, i32 2, <4 x i1> %tmp8, <4 x i16> undef) + %tmp11 = sext <4 x i16> %wide.masked.load14 to <4 x i32> + %tmp12 = mul nsw <4 x i32> %tmp11, %tmp10 + %tmp13 = add <4 x i32> %tmp12, %vec.phi + %scevgep = getelementptr i16, i16* %lsr.iv, i32 4 + %scevgep19 = getelementptr i16, i16* %lsr.iv18, i32 4 + %tmp14 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv1, i32 1) + %tmp15 = icmp ne i32 %tmp14, 0 + %lsr.iv.next = add nsw i32 %lsr.iv1, -1 + br i1 %tmp15, label %vector.body, label %middle.block + + middle.block: ; preds = %vector.body + %0 = add i32 %tmp9, 4 + %insert.idx = insertelement <4 x 
i32> undef, i32 %0, i32 0 + %idx.splat = shufflevector <4 x i32> %insert.idx, <4 x i32> undef, <4 x i32> zeroinitializer + %n.minusone = add i32 %N, -1 + %insert.n = insertelement <4 x i32> undef, i32 %n.minusone, i32 0 + %n.splat = shufflevector <4 x i32> %insert.n, <4 x i32> undef, <4 x i32> zeroinitializer + %tmp16 = icmp ult <4 x i32> %idx.splat, %n.splat + %tmp17 = select <4 x i1> %tmp16, <4 x i32> %tmp13, <4 x i32> %vec.phi + %tmp18 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp17) + br label %for.cond.cleanup + + for.cond.cleanup: ; preds = %middle.block, %entry + %res.0.lcssa = phi i32 [ 0, %entry ], [ %tmp18, %middle.block ] + ret i32 %res.0.lcssa + } + declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1 + declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) #2 + declare void @llvm.set.loop.iterations.i32(i32) #3 + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 + declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4 + +... 
+--- +name: wrong_vctp_liveout +alignment: 2 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: wrong_vctp_liveout + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r7 + ; CHECK: tCMPi8 renamable $r2, 0, 14, $noreg, implicit-def $cpsr + ; CHECK: t2IT 0, 4, implicit-def $itstate + ; CHECK: renamable $r0 = tMOVi8 $noreg, 0, 0, $cpsr, implicit killed $r0, implicit $itstate + ; CHECK: tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate + ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset 
$r7, -8 + ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14, $noreg + ; CHECK: renamable $q1 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q1 + ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14, $noreg, $noreg + ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14, $noreg, $noreg + ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg + ; CHECK: renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14, $noreg, $noreg + ; CHECK: $r12 = tMOVr killed $r3, 14, $noreg + ; CHECK: $r3 = tMOVr $r2, 14, $noreg + ; CHECK: dead $lr = t2DLS renamable $r3 + ; CHECK: bb.1.vector.body: + ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: liveins: $q1, $r0, $r1, $r2, $r3, $r12 + ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg + ; CHECK: $q0 = MVE_VORR killed $q1, killed $q1, 0, $noreg, undef $q0 + ; CHECK: MVE_VPST 4, implicit $vpr + ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17, align 2) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, killed renamable $vpr :: (load 8 from %ir.lsr.iv1820, align 2) + ; CHECK: $lr = tMOVr $r12, 14, $noreg + ; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 + ; CHECK: renamable $r12 = nsw t2SUBri killed $r12, 1, 14, $noreg, $noreg + ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg + ; CHECK: renamable $q1 = MVE_VADDi32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1 + ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.1 + ; CHECK: bb.2.middle.block: + ; CHECK: liveins: $q0, $q1, $r2, $r3 + ; CHECK: renamable $r0, dead $cpsr = tSUBi3 killed renamable $r2, 1, 14, $noreg + ; CHECK: renamable $q2 = MVE_VDUP32 killed renamable $r0, 0, $noreg, undef renamable $q2 + ; CHECK: renamable $r0, dead $cpsr = tADDi3 killed 
renamable $r3, 4, 14, $noreg + ; CHECK: renamable $vpr = MVE_VCMPu32r killed renamable $q2, killed renamable $r0, 8, 0, $noreg + ; CHECK: renamable $q0 = MVE_VPSEL killed renamable $q1, killed renamable $q0, 0, killed renamable $vpr + ; CHECK: renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg + ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc, implicit killed $r0 + bb.0.entry: + successors: %bb.1(0x80000000) + liveins: $r0, $r1, $r2, $lr, $r7 + + tCMPi8 renamable $r2, 0, 14, $noreg, implicit-def $cpsr + t2IT 0, 4, implicit-def $itstate + renamable $r0 = tMOVi8 $noreg, 0, 0, $cpsr, implicit killed $r0, implicit $itstate + tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate + frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14, $noreg + renamable $q1 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q1 + renamable $r3 = t2BICri killed renamable $r3, 3, 14, $noreg, $noreg + renamable $r12 = t2SUBri killed renamable $r3, 4, 14, $noreg, $noreg + renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg + renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14, $noreg, $noreg + t2DoLoopStart renamable $r3 + $r12 = tMOVr killed $r3, 14, $noreg + $r3 = tMOVr $r2, 14, $noreg + + bb.1.vector.body: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $q1, $r0, $r1, $r2, $r3, $r12 + + renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg + $q0 = MVE_VORR killed $q1, $q1, 0, $noreg, undef $q0 + MVE_VPST 4, implicit $vpr + renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17, align 2) + renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, killed renamable $vpr :: (load 8 from %ir.lsr.iv1820, align 2) + $lr = tMOVr 
$r12, 14, $noreg + renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 + renamable $r12 = nsw t2SUBri killed $r12, 1, 14, $noreg, $noreg + renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg + renamable $q1 = MVE_VADDi32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1 + renamable $lr = t2LoopDec killed renamable $lr, 1 + t2LoopEnd killed renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.2, 14, $noreg + + bb.2.middle.block: + liveins: $q0, $q1, $r2, $r3 + + renamable $r0, dead $cpsr = tSUBi3 killed renamable $r2, 1, 14, $noreg + renamable $q2 = MVE_VDUP32 killed renamable $r0, 0, $noreg, undef renamable $q2 + renamable $r0, dead $cpsr = tADDi3 killed renamable $r3, 4, 14, $noreg + renamable $vpr = MVE_VCMPu32r killed renamable $q2, killed renamable $r0, 8, 0, $noreg + renamable $q0 = MVE_VPSEL killed renamable $q1, killed renamable $q0, 0, killed renamable $vpr + renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg + tPOP_RET 14, $noreg, def $r7, def $pc, implicit killed $r0 + +... Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir @@ -0,0 +1,194 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - | FileCheck %s + +# The VCTP uses r2, which is redefined in the loop. 
+ +--- | + define dso_local i32 @wrong_vctp_liveout(i16* nocapture readonly %a, i16* nocapture readonly %b, i32 %N) local_unnamed_addr #0 { + entry: + %cmp9 = icmp eq i32 %N, 0 + %0 = add i32 %N, 3 + %1 = lshr i32 %0, 2 + %2 = shl nuw i32 %1, 2 + %3 = add i32 %2, -4 + %4 = lshr i32 %3, 2 + %5 = add nuw nsw i32 %4, 1 + br i1 %cmp9, label %for.cond.cleanup, label %vector.ph + + vector.ph: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %5) + br label %vector.body + + vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %5, %vector.ph ] + %lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %vector.ph ] + %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ] + %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %12, %vector.body ] + %6 = phi i32 [ %N, %vector.ph ], [ %8, %vector.body ] + %lsr.iv17 = bitcast i16* %lsr.iv to <4 x i16>* + %lsr.iv1820 = bitcast i16* %lsr.iv18 to <4 x i16>* + %7 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %6) + %8 = sub i32 %6, 4 + %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %7, <4 x i16> undef) + %9 = sext <4 x i16> %wide.masked.load to <4 x i32> + %wide.masked.load14 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv1820, i32 2, <4 x i1> %7, <4 x i16> undef) + %10 = sext <4 x i16> %wide.masked.load14 to <4 x i32> + %11 = mul nsw <4 x i32> %10, %9 + %12 = add <4 x i32> %11, %vec.phi + %scevgep = getelementptr i16, i16* %lsr.iv, i32 4 + %scevgep19 = getelementptr i16, i16* %lsr.iv18, i32 4 + %13 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv1, i32 1) + %14 = icmp ne i32 %13, 0 + %lsr.iv.next = add nsw i32 %lsr.iv1, -1 + br i1 %14, label %vector.body, label %middle.block + + middle.block: ; preds = %vector.body + %15 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %8) + %16 = select <4 x i1> %15, <4 x i32> %12, <4 x i32> %vec.phi + %17 = call i32 
@llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %16) + br label %for.cond.cleanup + + for.cond.cleanup: ; preds = %middle.block, %entry + %res.0.lcssa = phi i32 [ 0, %entry ], [ %17, %middle.block ] + ret i32 %res.0.lcssa + } + declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1 + declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) #2 + declare void @llvm.set.loop.iterations.i32(i32) #3 + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 + declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4 +... +--- +name: wrong_vctp_liveout +alignment: 2 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: wrong_vctp_liveout + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; 
CHECK: liveins: $lr, $r0, $r1, $r2, $r7 + ; CHECK: tCMPi8 renamable $r2, 0, 14, $noreg, implicit-def $cpsr + ; CHECK: t2IT 0, 4, implicit-def $itstate + ; CHECK: renamable $r0 = tMOVi8 $noreg, 0, 0, $cpsr, implicit killed $r0, implicit $itstate + ; CHECK: tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate + ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14, $noreg + ; CHECK: renamable $q1 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q1 + ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14, $noreg, $noreg + ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14, $noreg, $noreg + ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg + ; CHECK: renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14, $noreg, $noreg + ; CHECK: dead $lr = t2DLS renamable $r12 + ; CHECK: $r3 = tMOVr killed $r12, 14, $noreg + ; CHECK: bb.1.vector.body: + ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: liveins: $q1, $r0, $r1, $r2, $r3 + ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg + ; CHECK: $q0 = MVE_VORR killed $q1, killed $q1, 0, $noreg, undef $q0 + ; CHECK: MVE_VPST 4, implicit $vpr + ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17, align 2) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, killed renamable $vpr :: (load 8 from %ir.lsr.iv1820, align 2) + ; CHECK: $lr = tMOVr $r3, 14, $noreg + ; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 + ; CHECK: renamable $r3, dead $cpsr = nsw tSUBi8 killed $r3, 1, 14, $noreg + ; CHECK: 
renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14, $noreg + ; CHECK: renamable $q1 = MVE_VADDi32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1 + ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.1 + ; CHECK: bb.2.middle.block: + ; CHECK: liveins: $q0, $q1, $r2 + ; CHECK: renamable $vpr = MVE_VCTP32 killed renamable $r2, 0, $noreg + ; CHECK: renamable $q0 = MVE_VPSEL killed renamable $q1, killed renamable $q0, 0, killed renamable $vpr + ; CHECK: renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg + ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc, implicit killed $r0 + bb.0.entry: + successors: %bb.1(0x80000000) + liveins: $r0, $r1, $r2, $lr, $r7 + + tCMPi8 renamable $r2, 0, 14, $noreg, implicit-def $cpsr + t2IT 0, 4, implicit-def $itstate + renamable $r0 = tMOVi8 $noreg, 0, 0, $cpsr, implicit killed $r0, implicit $itstate + tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate + frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14, $noreg + renamable $q1 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q1 + renamable $r3 = t2BICri killed renamable $r3, 3, 14, $noreg, $noreg + renamable $r12 = t2SUBri killed renamable $r3, 4, 14, $noreg, $noreg + renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg + renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14, $noreg, $noreg + t2DoLoopStart renamable $r12 + $r3 = tMOVr killed $r12, 14, $noreg + + bb.1.vector.body: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $q1, $r0, $r1, $r2, $r3 + + renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg + $q0 = MVE_VORR killed $q1, $q1, 0, $noreg, undef $q0 + MVE_VPST 4, implicit $vpr + renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 
1, renamable $vpr :: (load 8 from %ir.lsr.iv17, align 2) + renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, killed renamable $vpr :: (load 8 from %ir.lsr.iv1820, align 2) + $lr = tMOVr $r3, 14, $noreg + renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 + renamable $r3, dead $cpsr = nsw tSUBi8 killed $r3, 1, 14, $noreg + renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14, $noreg + renamable $q1 = MVE_VADDi32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1 + renamable $lr = t2LoopDec killed renamable $lr, 1 + t2LoopEnd killed renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.2, 14, $noreg + + bb.2.middle.block: + liveins: $q0, $q1, $r2 + + renamable $vpr = MVE_VCTP32 killed renamable $r2, 0, $noreg + renamable $q0 = MVE_VPSEL killed renamable $q1, killed renamable $q0, 0, killed renamable $vpr + renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg + tPOP_RET 14, $noreg, def $r7, def $pc, implicit killed $r0 + +...