Index: llvm/include/llvm/CodeGen/ReachingDefAnalysis.h =================================================================== --- llvm/include/llvm/CodeGen/ReachingDefAnalysis.h +++ llvm/include/llvm/CodeGen/ReachingDefAnalysis.h @@ -119,12 +119,9 @@ /// use or a live out. bool isRegUsedAfter(MachineInstr *MI, int PhysReg); - /// Provides the first instruction before MI that uses PhysReg - MachineInstr *getInstWithUseBefore(MachineInstr *MI, int PhysReg); - /// Provides all instructions before MI that uses PhysReg void getAllInstWithUseBefore(MachineInstr *MI, int PhysReg, - SmallVectorImpl &Uses); + SmallPtrSetImpl &Uses) const; /// Provides the clearance - the number of instructions since the closest /// reaching def instuction of PhysReg that reaches MI. @@ -133,7 +130,7 @@ /// Provides the uses, in the same block as MI, of register that MI defines. /// This does not consider live-outs. void getReachingLocalUses(MachineInstr *MI, int PhysReg, - SmallVectorImpl &Uses); + SmallPtrSetImpl &Uses); /// Provide the number of uses, in the same block as MI, of the register that /// MI defines. 
Index: llvm/lib/CodeGen/ReachingDefAnalysis.cpp =================================================================== --- llvm/lib/CodeGen/ReachingDefAnalysis.cpp +++ llvm/lib/CodeGen/ReachingDefAnalysis.cpp @@ -227,7 +227,7 @@ } void ReachingDefAnalysis::getReachingLocalUses(MachineInstr *Def, int PhysReg, - SmallVectorImpl &Uses) { + SmallPtrSetImpl &Uses) { MachineBasicBlock *MBB = Def->getParent(); MachineBasicBlock::iterator MI = MachineBasicBlock::iterator(Def); while (++MI != MBB->end()) { @@ -240,7 +240,7 @@ if (!MO.isReg() || !MO.isUse() || MO.getReg() != PhysReg) continue; - Uses.push_back(&*MI); + Uses.insert(&*MI); if (MO.isKill()) return; } @@ -248,7 +248,7 @@ } unsigned ReachingDefAnalysis::getNumUses(MachineInstr *Def, int PhysReg) { - SmallVector Uses; + SmallPtrSet Uses; getReachingLocalUses(Def, PhysReg, Uses); return Uses.size(); } @@ -308,27 +308,13 @@ return Def < 0 ? nullptr : getInstFromId(MBB, Def); } -MachineInstr *ReachingDefAnalysis::getInstWithUseBefore(MachineInstr *MI, - int PhysReg) { - auto I = MachineBasicBlock::reverse_iterator(MI); - auto E = MI->getParent()->rend(); - I++; +void +ReachingDefAnalysis::getAllInstWithUseBefore(MachineInstr *MI, int PhysReg, + SmallPtrSetImpl &Uses) const { - for ( ; I != E; I++) + MachineBasicBlock *MBB = MI->getParent(); + for (auto I = MBB->begin(), E = MachineBasicBlock::iterator(MI); I != E; ++I) for (auto &MO : I->operands()) if (MO.isReg() && MO.isUse() && MO.getReg() == PhysReg) - return &*I; - - return nullptr; -} - -void ReachingDefAnalysis::getAllInstWithUseBefore(MachineInstr *MI, - int PhysReg, SmallVectorImpl &Uses) { - MachineInstr *Use = nullptr; - MachineInstr *Pos = MI; - - while ((Use = getInstWithUseBefore(Pos, PhysReg))) { - Uses.push_back(Use); - Pos = Use; - } + Uses.insert(&*I); } Index: llvm/lib/Target/ARM/ARMBaseInstrInfo.h =================================================================== --- llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ llvm/lib/Target/ARM/ARMBaseInstrInfo.h 
@@ -491,23 +491,24 @@ // This table shows the VPT instruction variants, i.e. the different // mask field encodings, see also B5.6. Predication/conditional execution in // the ArmARM. -enum VPTMaskValue { - T = 8, // 0b1000 - TT = 4, // 0b0100 - TE = 12, // 0b1100 - TTT = 2, // 0b0010 - TTE = 6, // 0b0110 - TEE = 10, // 0b1010 - TET = 14, // 0b1110 - TTTT = 1, // 0b0001 - TTTE = 3, // 0b0011 - TTEE = 5, // 0b0101 - TTET = 7, // 0b0111 - TEEE = 9, // 0b1001 - TEET = 11, // 0b1011 - TETT = 13, // 0b1101 - TETE = 15 // 0b1111 -}; + + +inline static unsigned getARMVPTBlockMask(unsigned NumInsts) { + switch (NumInsts) { + case 1: + return ARMVCC::T; + case 2: + return ARMVCC::TT; + case 3: + return ARMVCC::TTT; + case 4: + return ARMVCC::TTTT; + default: + break; + }; + llvm_unreachable("Unexpected number of instruction in a VPT block"); +} + static inline bool isVPTOpcode(int Opc) { return Opc == ARM::MVE_VPTv16i8 || Opc == ARM::MVE_VPTv16u8 || @@ -595,6 +596,18 @@ return 0; } +static inline unsigned getTailPredVectorWidth(unsigned Opcode) { + switch (Opcode) { + default: + llvm_unreachable("unhandled vctp opcode"); + case ARM::MVE_VCTP8: return 16; + case ARM::MVE_VCTP16: return 8; + case ARM::MVE_VCTP32: return 4; + case ARM::MVE_VCTP64: return 2; + } + return 0; +} + static inline bool isVCTP(MachineInstr *MI) { switch (MI->getOpcode()) { @@ -642,6 +655,16 @@ Opc == ARM::STMDB_UPD || Opc == ARM::VSTMDDB_UPD; } +static inline bool isSubImmOpcode(int Opc) { + return Opc == ARM::SUBri || + Opc == ARM::tSUBi3 || Opc == ARM::tSUBi8 || + Opc == ARM::t2SUBri || Opc == ARM::t2SUBri12 || Opc == ARM::t2SUBSri; +} + +static inline bool isMovRegOpcode(int Opc) { + return Opc == ARM::MOVr || Opc == ARM::tMOVr || Opc == ARM::t2MOVr; +} + /// isValidCoprocessorNumber - decide whether an explicit coprocessor /// number is legal in generic instructions like CDP. The answer can /// vary with the subtarget. 
Index: llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp =================================================================== --- llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -24,6 +24,7 @@ #include "ARMSubtarget.h" #include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineLoopUtils.h" @@ -100,9 +101,7 @@ public: PredicatedMI(MachineInstr *I, SetVector &Preds) : - MI(I) { - Predicates.insert(Preds.begin(), Preds.end()); - } + MI(I) { Predicates.insert(Preds.begin(), Preds.end()); } }; // Represent a VPT block, a list of instructions that begins with a VPST and @@ -165,6 +164,7 @@ VPTBlock *CurrentBlock = nullptr; SetVector CurrentPredicate; SmallVector VPTBlocks; + SmallPtrSet ToRemove; bool Revert = false; bool CannotTailPredicate = false; @@ -294,8 +294,6 @@ void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const; - void RemoveLoopUpdate(LowOverheadLoop &LoLoop); - void ConvertVPTBlocks(LowOverheadLoop &LoLoop); MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop); @@ -372,9 +370,66 @@ return true; } +static bool IsSafeToRemove(MachineInstr *MI, ReachingDefAnalysis *RDA, + SmallPtrSetImpl &ToRemove, + SmallPtrSetImpl &Ignore) { + auto getLocalUses = [&RDA](MachineInstr *MI, + SmallPtrSetImpl &Uses) { + MachineBasicBlock *MBB = MI->getParent(); + for (auto &MO : MI->operands()) { + if (!MO.isReg() || !MO.isDef()) + continue; + + // Collect the uses that each def touches within the block. + RDA->getReachingLocalUses(MI, MO.getReg(), Uses); + + // If the value is live-out, check if this block is a single-block loop, + // and if so, then check the users before MI. If we find that the value + // is live-out in any other block then we can't remove it. 
+ if (auto *LiveOut = RDA->getLocalLiveOutMIDef(MI->getParent(), + MO.getReg())) { + if (LiveOut != MI) + continue; + for (auto SuccBB : MBB->successors()) { + if (!SuccBB->isLiveIn(MO.getReg())) + continue; + + if (SuccBB == MBB) + RDA->getAllInstWithUseBefore(MI, MO.getReg(), Uses); + else + return false; + } + } + } + return true; + }; + + // Unless told to ignore the instruction, don't remove anything which has + // side effects. + if (!Ignore.count(MI)) { + if (MI->mayLoadOrStore() || MI->hasUnmodeledSideEffects() || + MI->isBranch()) + return false; + } + + ToRemove.insert(MI); + + SmallPtrSet Uses; + if (!getLocalUses(MI, Uses)) + return false; + + for (auto I : Uses) { + if (Ignore.count(I) || ToRemove.count(I)) + continue; + if (!IsSafeToRemove(I, RDA, ToRemove, Ignore)) + return false; + } + return true; +} + bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt, - ReachingDefAnalysis *RDA, - MachineLoopInfo *MLI) { + ReachingDefAnalysis *RDA, + MachineLoopInfo *MLI) { // All predication within the loop should be based on vctp. If the block // isn't predicated on entry, check whether the vctp is within the block // and that all other instructions are then predicated on it. @@ -388,10 +443,9 @@ if (PredMI.Predicates.count(VCTP) || isVCTP(PredMI.MI)) continue; LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *PredMI.MI - << " - which is predicated on:\n"; - for (auto *MI : PredMI.Predicates) - dbgs() << " - " << *MI; - ); + << " - which is predicated on:\n"; + for (auto *MI : PredMI.Predicates) + dbgs() << " - " << *MI); return false; } } @@ -413,17 +467,20 @@ // The element count register maybe defined after InsertPt, in which case we // need to try to move either InsertPt or the def so that the [w|d]lstp can // use the value. 
- MachineBasicBlock *InsertBB = InsertPt->getParent(); - if (!RDA->isReachingDefLiveOut(InsertPt, NumElements)) { + MachineBasicBlock *InsertBB = StartInsertPt->getParent(); + if (!RDA->isReachingDefLiveOut(StartInsertPt, NumElements)) { if (auto *ElemDef = RDA->getLocalLiveOutMIDef(InsertBB, NumElements)) { - if (IsSafeToMove(ElemDef, InsertPt, RDA)) { + if (IsSafeToMove( + ElemDef, StartInsertPt, RDA)) { ElemDef->removeFromParent(); - InsertBB->insert(MachineBasicBlock::iterator(InsertPt), ElemDef); + InsertBB->insert(MachineBasicBlock::iterator(StartInsertPt), ElemDef); LLVM_DEBUG(dbgs() << "ARM Loops: Moved element count def: " << *ElemDef); - } else if (IsSafeToMove(InsertPt, ElemDef, RDA)) { - InsertPt->removeFromParent(); - InsertBB->insertAfter(MachineBasicBlock::iterator(ElemDef), InsertPt); + } else if (IsSafeToMove( + StartInsertPt, ElemDef, RDA)) { + StartInsertPt->removeFromParent(); + InsertBB->insertAfter(MachineBasicBlock::iterator(ElemDef), + StartInsertPt); LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef); } else return false; @@ -458,7 +515,54 @@ MBB = *MBB->pred_begin(); } - LLVM_DEBUG(dbgs() << "ARM Loops: Will use tail predication.\n"); + // Check that the value change of the element count is what we expect and + // that the predication will be equivalent. For this we need: + // NumElements = NumElements - VectorWidth. The sub will be a sub immediate + // and we can also allow register copies within the chain too. 
+ auto IsValidSub = [](MachineInstr *MI, unsigned VecWidth) { + unsigned ImmOpIdx = 0; + switch (MI->getOpcode()) { + default: + llvm_unreachable("unhandled sub opcode"); + case ARM::tSUBi3: + case ARM::tSUBi8: + ImmOpIdx = 3; + break; + case ARM::t2SUBri: + case ARM::t2SUBri12: + ImmOpIdx = 2; + break; + } + return MI->getOperand(ImmOpIdx).getImm() == VecWidth; + }; + + MBB = VCTP->getParent(); + if (MachineInstr *Def = RDA->getReachingMIDef(&MBB->back(), NumElements)) { + SmallPtrSet ElementChain; + SmallPtrSet Ignore = { VCTP }; + unsigned VectorWidth = getTailPredVectorWidth(VCTP->getOpcode()); + + if (IsSafeToRemove(Def, RDA, ElementChain, Ignore)) { + bool FoundSub = false; + + for (auto *MI : ElementChain) { + if (isMovRegOpcode(MI->getOpcode())) + continue; + + if (isSubImmOpcode(MI->getOpcode())) { + if (FoundSub || !IsValidSub(MI, VectorWidth)) + return false; + FoundSub = true; + } else + return false; + } + + LLVM_DEBUG(dbgs() << "ARM Loops: Will remove element count chain:\n"; + for (auto *MI : ElementChain) + dbgs() << " - " << *MI); + ToRemove.insert(ElementChain.begin(), ElementChain.end()); + } + } return true; } @@ -615,6 +719,8 @@ dbgs() << " - " << Preheader->getName() << "\n"; else if (auto *Preheader = MLI->findLoopPreheader(ML)) dbgs() << " - " << Preheader->getName() << "\n"; + else if (auto *Preheader = MLI->findLoopPreheader(ML, true)) + dbgs() << " - " << Preheader->getName() << "\n"; for (auto *MBB : ML->getBlocks()) dbgs() << " - " << MBB->getName() << "\n"; ); @@ -665,28 +771,6 @@ // Check we know how to tail predicate any mve instructions. LoLoop.AnalyseMVEInst(&MI); } - - // We need to ensure that LR is not used or defined inbetween LoopDec and - // LoopEnd. - if (!LoLoop.Dec || LoLoop.End || LoLoop.Revert) - continue; - - // If we find that LR has been written or read between LoopDec and - // LoopEnd, expect that the decremented value is being used else where. 
- // Because this value isn't actually going to be produced until the - // latch, by LE, we would need to generate a real sub. The value is also - // likely to be copied/reloaded for use of LoopEnd - in which in case - // we'd need to perform an add because it gets subtracted again by LE! - // The other option is to then generate the other form of LE which doesn't - // perform the sub. - for (auto &MO : MI.operands()) { - if (MI.getOpcode() != ARM::t2LoopDec && MO.isReg() && - MO.getReg() == ARM::LR) { - LLVM_DEBUG(dbgs() << "ARM Loops: Found LR Use/Def: " << MI); - LoLoop.Revert = true; - break; - } - } } } @@ -694,6 +778,15 @@ if (!LoLoop.FoundAllComponents()) return false; + SmallPtrSet Ignore = { LoLoop.Dec, LoLoop.End }; + if (!IsSafeToRemove(LoLoop.Dec, RDA, LoLoop.ToRemove, Ignore)) { + LLVM_DEBUG(dbgs() << "ARM Loops: Unable to remove loop count chain.\n"); + LoLoop.Revert = true; + } else + LLVM_DEBUG(dbgs() << "ARM Loops: Will need to remove:\n"; + for (auto *I : LoLoop.ToRemove) + dbgs() << " - " << *I); + LoLoop.CheckLegality(BBUtils.get(), RDA, MLI); Expand(LoLoop); return true; @@ -819,8 +912,11 @@ Killed.push_back(Kill); } } - for (auto *MI : Dead) + LLVM_DEBUG(dbgs() << "ARM Loops: Erasing iteration count chain:\n"); + for (auto *MI : Dead) { + LLVM_DEBUG(dbgs() << " - " << *MI); MI->eraseFromParent(); + } } } @@ -832,70 +928,6 @@ return &*MIB; } -// Goal is to optimise and clean-up these loops: -// -// vector.body: -// renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg -// renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3(tied-def 0), 4 -// .. -// $lr = MVE_DLSTP_32 renamable $r3 -// -// The SUB is the old update of the loop iteration count expression, which -// is no longer needed. This sub is removed when the element count, which is in -// r3 in this example, is defined by an instruction in the loop, and it has -// no uses. 
-// -void ARMLowOverheadLoops::RemoveLoopUpdate(LowOverheadLoop &LoLoop) { - Register ElemCount = LoLoop.VCTP->getOperand(1).getReg(); - MachineInstr *LastInstrInBlock = &LoLoop.VCTP->getParent()->back(); - - LLVM_DEBUG(dbgs() << "ARM Loops: Trying to remove loop update stmt\n"); - - if (LoLoop.ML->getNumBlocks() != 1) { - LLVM_DEBUG(dbgs() << "ARM Loops: single block loop expected\n"); - return; - } - - LLVM_DEBUG(dbgs() << "ARM Loops: Analyzing MO: "; - LoLoop.VCTP->getOperand(1).dump()); - - // Find the definition we are interested in removing, if there is one. - MachineInstr *Def = RDA->getReachingMIDef(LastInstrInBlock, ElemCount); - if (!Def) - return; - - // Bail if we define CPSR and it is not dead - if (!Def->registerDefIsDead(ARM::CPSR, TRI)) { - LLVM_DEBUG(dbgs() << "ARM Loops: CPSR is not dead\n"); - return; - } - - // Bail if elemcount is used in exit blocks, i.e. if it is live-in. - if (isRegLiveInExitBlocks(LoLoop.ML, ElemCount)) { - LLVM_DEBUG(dbgs() << "ARM Loops: Elemcount is live-out, can't remove stmt\n"); - return; - } - - // Bail if there are uses after this Def in the block. - SmallVector Uses; - RDA->getReachingLocalUses(Def, ElemCount, Uses); - if (Uses.size()) { - LLVM_DEBUG(dbgs() << "ARM Loops: Local uses in block, can't remove stmt\n"); - return; - } - - Uses.clear(); - RDA->getAllInstWithUseBefore(Def, ElemCount, Uses); - - // Remove Def if there are no uses, or if the only use is the VCTP - // instruction. 
- if (!Uses.size() || (Uses.size() == 1 && Uses[0] == LoLoop.VCTP)) { - LLVM_DEBUG(dbgs() << "ARM Loops: Removing loop update instruction: "; - Def->dump()); - Def->eraseFromParent(); - } -} - void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) { auto RemovePredicate = [](MachineInstr *MI) { LLVM_DEBUG(dbgs() << "ARM Loops: Removing predicate from: " << *MI); @@ -968,7 +1000,6 @@ RemovePredicate(PredMI.MI); } } - LLVM_DEBUG(dbgs() << "ARM Loops: Removing VCTP: " << *LoLoop.VCTP); LoLoop.VCTP->eraseFromParent(); } @@ -987,9 +1018,7 @@ MIB.add(End->getOperand(0)); MIB.add(End->getOperand(1)); LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB); - - LoLoop.End->eraseFromParent(); - LoLoop.Dec->eraseFromParent(); + End->eraseFromParent(); return &*MIB; }; @@ -1022,9 +1051,11 @@ RemoveDeadBranch(LoLoop.Start); LoLoop.End = ExpandLoopEnd(LoLoop); RemoveDeadBranch(LoLoop.End); - if (LoLoop.IsTailPredicationLegal()) { - RemoveLoopUpdate(LoLoop); + if (LoLoop.IsTailPredicationLegal()) ConvertVPTBlocks(LoLoop); + for (auto *I : LoLoop.ToRemove) { + LLVM_DEBUG(dbgs() << "ARM Loops: Erasing " << *I); + I->eraseFromParent(); } } Index: llvm/lib/Target/ARM/Utils/ARMBaseInfo.h =================================================================== --- llvm/lib/Target/ARM/Utils/ARMBaseInfo.h +++ llvm/lib/Target/ARM/Utils/ARMBaseInfo.h @@ -111,22 +111,6 @@ }; } -inline static unsigned getARMVPTBlockMask(unsigned NumInsts) { - switch (NumInsts) { - case 1: - return ARMVCC::T; - case 2: - return ARMVCC::TT; - case 3: - return ARMVCC::TTT; - case 4: - return ARMVCC::TTTT; - default: - break; - }; - llvm_unreachable("Unexpected number of instruction in a VPT block"); -} - inline static const char *ARMVPTPredToString(ARMVCC::VPTCodes CC) { switch (CC) { case ARMVCC::None: return "none"; Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-mov.mir =================================================================== --- 
llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-mov.mir +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-mov.mir @@ -1,19 +1,17 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=thumbv8.1m.main -run-pass=arm-low-overhead-loops %s -verify-machineinstrs -o - | FileCheck %s -# CHECK-NOT: $lr = tMOVr $r0, 13 -# CHECK: $lr = t2DLS killed $r0 -# CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1 --- | target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" target triple = "thumbv8.1m.main" - + define i32 @do_copy(i32 %n, i32* nocapture %p, i32* nocapture readonly %q) { entry: %scevgep = getelementptr i32, i32* %q, i32 -1 %scevgep3 = getelementptr i32, i32* %p, i32 -1 call void @llvm.set.loop.iterations.i32(i32 %n) br label %while.body - + while.body: ; preds = %while.body, %entry %lsr.iv4 = phi i32* [ %scevgep5, %while.body ], [ %scevgep3, %entry ] %lsr.iv = phi i32* [ %scevgep1, %while.body ], [ %scevgep, %entry ] @@ -27,14 +25,14 @@ %2 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %0, i32 1) %3 = icmp ne i32 %2, 0 br i1 %3, label %while.body, label %while.end - + while.end: ; preds = %while.body ret i32 0 } - + declare void @llvm.set.loop.iterations.i32(i32) #0 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #0 - + attributes #0 = { noduplicate nounwind } attributes #1 = { nounwind } @@ -75,20 +73,40 @@ restorePoint: '' fixedStack: [] stack: - - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + 
- { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } callSites: [] constants: [] machineFunctionInfo: {} body: | + ; CHECK-LABEL: name: do_copy + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $r1, $r2, $r0, $r7, $lr + ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: $lr = t2DLS killed $r0 + ; CHECK: renamable $r0, dead $cpsr = tSUBi3 killed renamable $r1, 4, 14, $noreg + ; CHECK: renamable $r1, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14, $noreg + ; CHECK: bb.1.while.body: + ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: liveins: $lr, $r0, $r1 + ; CHECK: renamable $r2, renamable $r1 = t2LDR_PRE killed renamable $r1, 4, 14, $noreg :: (load 4 from %ir.scevgep6) + ; CHECK: early-clobber renamable $r0 = t2STR_PRE killed renamable $r2, killed renamable $r0, 4, 14, $noreg :: (store 4 into %ir.scevgep2) + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1 + ; CHECK: bb.2.while.end: + ; CHECK: $r0, dead $cpsr = tMOVi8 0, 14, $noreg + ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc, implicit killed $r0 bb.0.entry: successors: %bb.1(0x80000000) liveins: $r0, $r1, $r2, $r7, $lr - + frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp frame-setup CFI_INSTRUCTION def_cfa_offset 8 frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -97,17 +115,17 @@ t2DoLoopStart killed $r0 renamable $r0, dead $cpsr = tSUBi3 killed renamable $r1, 4, 14, $noreg renamable $r1, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14, $noreg - + bb.1.while.body: successors: %bb.1(0x7c000000), %bb.2(0x04000000) liveins: $lr, 
$r0, $r1 - + renamable $r2, renamable $r1 = t2LDR_PRE killed renamable $r1, 4, 14, $noreg :: (load 4 from %ir.scevgep6) early-clobber renamable $r0 = t2STR_PRE killed renamable $r2, killed renamable $r0, 4, 14, $noreg :: (store 4 into %ir.scevgep2) renamable $lr = t2LoopDec killed renamable $lr, 1 t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr tB %bb.2, 14, $noreg - + bb.2.while.end: $r0, dead $cpsr = tMOVi8 0, 14, $noreg tPOP_RET 14, $noreg, def $r7, def $pc, implicit killed $r0 Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir @@ -1,12 +1,11 @@ # RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve,+lob -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s -# There are 2 SUBS, and the 2nd one is identified as the def. -# Thus, the 1st is a use, and we shouldn't optimise away the SUBS. +# There are 2 SUBS, so don't use tail predication # CHECK: bb.1.vector.body: # CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg # CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg -# CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 +# CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1 --- | target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update2.mir =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update2.mir +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update2.mir @@ -167,6 +167,7 @@ tB %bb.2, 14, $noreg bb.2.for.cond.cleanup: + liveins: $cpsr tPOP_RET 14, $noreg, def $r7, def $pc ... 
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/end-positive-offset.mir =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/end-positive-offset.mir +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/end-positive-offset.mir @@ -1,19 +1,10 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=thumbv8.1m.main -run-pass=arm-low-overhead-loops %s -verify-machineinstrs -o - | FileCheck %s -# CHECK-NOT: DoLoopStart -# CHECK-NOT: DLS -# CHECK: bb.1.for.body: -# CHECK: $lr = t2SUBri killed renamable $lr, 1, 14, $noreg, def $cpsr -# CHECK-NOT: t2CMPri $lr -# CHECK: tBcc %bb.3, 1, killed $cpsr -# CHECK: tB %bb.2, 14, $noreg -# CHECK: bb.2.for.cond.cleanup: -# CHECK: bb.3.for.header: - --- | target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" target triple = "thumbv8.1m.main" - + define void @size_limit(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) { entry: call void @llvm.set.loop.iterations.i32(i32 %N) @@ -21,7 +12,7 @@ %scevgep4 = getelementptr i32, i32* %c, i32 -1 %scevgep8 = getelementptr i32, i32* %b, i32 -1 br label %for.header - + for.body: ; preds = %for.header %scevgep11 = getelementptr i32, i32* %lsr.iv9, i32 1 %ld1 = load i32, i32* %scevgep11, align 4 @@ -36,10 +27,10 @@ %count.next = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %count, i32 1) %cmp = icmp ne i32 %count.next, 0 br i1 %cmp, label %for.header, label %for.cond.cleanup - + for.cond.cleanup: ; preds = %for.body ret void - + for.header: ; preds = %for.body, %entry %lsr.iv9 = phi i32* [ %scevgep8, %entry ], [ %scevgep10, %for.body ] %lsr.iv5 = phi i32* [ %scevgep4, %entry ], [ %scevgep6, %for.body ] @@ -47,16 +38,16 @@ %count = phi i32 [ %N, %entry ], [ %count.next, %for.body ] br label %for.body } - + ; Function Attrs: nounwind declare i32 @llvm.arm.space(i32 immarg, i32) #0 - + ; Function Attrs: noduplicate nounwind declare void 
@llvm.set.loop.iterations.i32(i32) #1 - + ; Function Attrs: noduplicate nounwind declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1 - + attributes #0 = { nounwind } attributes #1 = { noduplicate nounwind } @@ -98,44 +89,95 @@ restorePoint: '' fixedStack: [] stack: - - { id: 0, name: '', type: spill-slot, offset: -12, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, + - { id: 0, name: '', type: spill-slot, offset: -12, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 1, name: '', type: spill-slot, offset: -16, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, + - { id: 1, name: '', type: spill-slot, offset: -16, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 2, name: '', type: spill-slot, offset: -20, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, + - { id: 2, name: '', type: spill-slot, offset: -20, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 3, name: '', type: spill-slot, offset: -24, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, + - { id: 3, name: '', type: spill-slot, offset: -24, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 4, name: '', type: spill-slot, offset: -28, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, + - { id: 4, name: '', 
type: spill-slot, offset: -28, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 5, name: '', type: spill-slot, offset: -32, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, + - { id: 5, name: '', type: spill-slot, offset: -32, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 6, name: '', type: spill-slot, offset: -36, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, + - { id: 6, name: '', type: spill-slot, offset: -36, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 7, name: '', type: spill-slot, offset: -40, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, + - { id: 7, name: '', type: spill-slot, offset: -40, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 8, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + - { id: 8, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 9, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + - { id: 9, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + 
stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } callSites: [] constants: [] machineFunctionInfo: {} body: | + ; CHECK-LABEL: name: size_limit + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: liveins: $r3, $r0, $r2, $r1, $r7, $lr + ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: $sp = frame-setup tSUBspi $sp, 8, 14, $noreg + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 40 + ; CHECK: renamable $r0, dead $cpsr = tSUBi8 killed renamable $r0, 4, 14, $noreg + ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14, $noreg + ; CHECK: renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14, $noreg + ; CHECK: tSTRspi killed $r1, $sp, 7, 14, $noreg :: (store 4 into %stack.0) + ; CHECK: tSTRspi killed $r2, $sp, 6, 14, $noreg :: (store 4 into %stack.1) + ; CHECK: tSTRspi killed $r0, $sp, 5, 14, $noreg :: (store 4 into %stack.2) + ; CHECK: tSTRspi killed $r3, $sp, 4, 14, $noreg :: (store 4 into %stack.3) + ; CHECK: tB %bb.3, 14, $noreg + ; CHECK: bb.1.for.body: + ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK: $r0 = tLDRspi $sp, 3, 14, $noreg :: (load 4 from %stack.4) + ; CHECK: renamable $r1, renamable $r0 = t2LDR_PRE killed renamable $r0, 4, 14, $noreg :: (load 4 from %ir.scevgep11) + ; CHECK: $r2 = tLDRspi $sp, 2, 14, $noreg :: (load 4 from %stack.5) + ; CHECK: renamable $r3, renamable $r2 = t2LDR_PRE killed renamable $r2, 4, 14, $noreg :: (load 4 from %ir.scevgep7) + ; CHECK: renamable $r1, dead $cpsr = nsw tMUL killed renamable $r3, killed renamable $r1, 14, $noreg + ; CHECK: $r3 = tLDRspi $sp, 1, 14, $noreg :: (load 4 from %stack.6) + ; CHECK: early-clobber renamable $r3 = 
t2STR_PRE killed renamable $r1, killed renamable $r3, 4, 14, $noreg :: (store 4 into %ir.scevgep3) + ; CHECK: $r1 = tLDRspi $sp, 0, 14, $noreg :: (load 4 from %stack.7) + ; CHECK: $lr = tMOVr killed $r1, 14, $noreg + ; CHECK: $lr = t2SUBri killed renamable $lr, 1, 14, $noreg, def $cpsr + ; CHECK: $r12 = tMOVr killed $lr, 14, $noreg + ; CHECK: tSTRspi killed $r0, $sp, 7, 14, $noreg :: (store 4 into %stack.0) + ; CHECK: tSTRspi killed $r2, $sp, 6, 14, $noreg :: (store 4 into %stack.1) + ; CHECK: tSTRspi killed $r3, $sp, 5, 14, $noreg :: (store 4 into %stack.2) + ; CHECK: t2STRi12 killed $r12, $sp, 16, 14, $noreg :: (store 4 into %stack.3) + ; CHECK: tBcc %bb.3, 1, killed $cpsr + ; CHECK: tB %bb.2, 14, $noreg + ; CHECK: bb.2.for.cond.cleanup: + ; CHECK: $sp = tADDspi $sp, 8, 14, $noreg + ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc + ; CHECK: bb.3.for.header: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: $r0 = tLDRspi $sp, 4, 14, $noreg :: (load 4 from %stack.3) + ; CHECK: $r1 = tLDRspi $sp, 5, 14, $noreg :: (load 4 from %stack.2) + ; CHECK: $r2 = tLDRspi $sp, 6, 14, $noreg :: (load 4 from %stack.1) + ; CHECK: $r3 = tLDRspi $sp, 7, 14, $noreg :: (load 4 from %stack.0) + ; CHECK: tSTRspi killed $r0, $sp, 0, 14, $noreg :: (store 4 into %stack.7) + ; CHECK: tSTRspi killed $r1, $sp, 1, 14, $noreg :: (store 4 into %stack.6) + ; CHECK: tSTRspi killed $r2, $sp, 2, 14, $noreg :: (store 4 into %stack.5) + ; CHECK: tSTRspi killed $r3, $sp, 3, 14, $noreg :: (store 4 into %stack.4) + ; CHECK: tB %bb.1, 14, $noreg bb.0.entry: successors: %bb.3(0x80000000) liveins: $r0, $r1, $r2, $r3, $r7, $lr - + frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp frame-setup CFI_INSTRUCTION def_cfa_offset 8 frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -151,10 +193,10 @@ tSTRspi killed $r0, $sp, 5, 14, $noreg :: (store 4 into %stack.2) tSTRspi killed $r3, $sp, 4, 14, $noreg :: (store 4 into %stack.3) tB %bb.3, 14, $noreg - + bb.1.for.body: successors: 
%bb.3(0x40000000), %bb.2(0x40000000) - + $r0 = tLDRspi $sp, 3, 14, $noreg :: (load 4 from %stack.4) renamable $r1, renamable $r0 = t2LDR_PRE renamable $r0, 4, 14, $noreg :: (load 4 from %ir.scevgep11) $r2 = tLDRspi $sp, 2, 14, $noreg :: (load 4 from %stack.5) @@ -172,14 +214,14 @@ t2STRi12 killed $r12, $sp, 16, 14, $noreg :: (store 4 into %stack.3) t2LoopEnd killed renamable $lr, %bb.3, implicit-def dead $cpsr tB %bb.2, 14, $noreg - + bb.2.for.cond.cleanup: $sp = tADDspi $sp, 8, 14, $noreg tPOP_RET 14, $noreg, def $r7, def $pc - + bb.3.for.header: successors: %bb.1(0x80000000) - + $r0 = tLDRspi $sp, 4, 14, $noreg :: (load 4 from %stack.3) $r1 = tLDRspi $sp, 5, 14, $noreg :: (load 4 from %stack.2) $r2 = tLDRspi $sp, 6, 14, $noreg :: (load 4 from %stack.1) Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-16.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-16.mir @@ -0,0 +1,165 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s +--- | + define dso_local void @incorrect_sub_16(i16* noalias nocapture %A, i16* noalias nocapture readonly %B, i16* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { + entry: + %cmp8 = icmp sgt i32 %N, 0 + %0 = add i32 %N, 3 + %1 = lshr i32 %0, 2 + %2 = shl nuw i32 %1, 2 + %3 = add i32 %2, -4 + %4 = lshr i32 %3, 2 + %5 = add nuw nsw i32 %4, 1 + br i1 %cmp8, label %vector.ph, label %for.cond.cleanup + + vector.ph: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %5) + br label %vector.body + + vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv17 = phi i16* [ %scevgep18, %vector.body ], [ %A, %vector.ph ] + %lsr.iv14 = phi i16* [ %scevgep15, %vector.body ], [ %C, %vector.ph ] + %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %B, %vector.ph ] + %6 
= phi i32 [ %5, %vector.ph ], [ %11, %vector.body ] + %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ] + %lsr.iv13 = bitcast i16* %lsr.iv to <8 x i16>* + %lsr.iv1416 = bitcast i16* %lsr.iv14 to <8 x i16>* + %lsr.iv1719 = bitcast i16* %lsr.iv17 to <8 x i16>* + %8 = call <8 x i1> @llvm.arm.mve.vctp16(i32 %7) + %9 = sub i32 %7, 7 + %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %lsr.iv13, i32 4, <8 x i1> %8, <8 x i16> undef) + %wide.masked.load12 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %lsr.iv1416, i32 4, <8 x i1> %8, <8 x i16> undef) + %10 = add nsw <8 x i16> %wide.masked.load12, %wide.masked.load + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %10, <8 x i16>* %lsr.iv1719, i32 4, <8 x i1> %8) + %scevgep = getelementptr i16, i16* %lsr.iv, i32 8 + %scevgep15 = getelementptr i16, i16* %lsr.iv14, i32 8 + %scevgep18 = getelementptr i16, i16* %lsr.iv17, i32 8 + %11 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1) + %12 = icmp ne i32 %11, 0 + br i1 %12, label %vector.body, label %for.cond.cleanup + + for.cond.cleanup: ; preds = %vector.body, %entry + ret void + } + declare void @llvm.set.loop.iterations.i32(i32) + declare <8 x i1> @llvm.arm.mve.vctp16(i32) + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) + declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>) + declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>) +... 
+--- +name: incorrect_sub_16 +alignment: 2 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } + - { reg: '$r3', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: incorrect_sub_16 + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $r3, $r1, $r0, $r2, $r7, $lr + ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr + ; CHECK: t2IT 11, 8, implicit-def $itstate + ; CHECK: tPOP_RET 11, killed $cpsr, def dead $r7, def $pc, implicit killed $itstate + ; CHECK: renamable $r12 = t2ADDri 
renamable $r3, 3, 14, $noreg, $noreg + ; CHECK: renamable $lr = t2MOVi 1, 14, $noreg, $noreg + ; CHECK: renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg + ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg + ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg + ; CHECK: $lr = t2DLS killed renamable $lr + ; CHECK: bb.1.vector.body: + ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: liveins: $lr, $r1, $r0, $r2, $r3 + ; CHECK: renamable $vpr = MVE_VCTP16 renamable $r3, 0, $noreg + ; CHECK: MVE_VPST 4, implicit $vpr + ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRHU16_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4) + ; CHECK: renamable $r2, renamable $q1 = MVE_VLDRHU16_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4) + ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 7, 14, $noreg + ; CHECK: renamable $q0 = nsw MVE_VADDi16 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 + ; CHECK: MVE_VPST 8, implicit $vpr + ; CHECK: renamable $r0 = MVE_VSTRHU16_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4) + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1 + ; CHECK: bb.2.for.cond.cleanup: + ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc + bb.0.entry: + successors: %bb.1(0x80000000) + liveins: $r0, $r1, $r2, $r3, $r7, $lr + + frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr + t2IT 11, 8, implicit-def $itstate + tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate + renamable $r12 = t2ADDri renamable $r3, 3, 14, 
$noreg, $noreg + renamable $lr = t2MOVi 1, 14, $noreg, $noreg + renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg + renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg + renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg + t2DoLoopStart renamable $lr + + bb.1.vector.body: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $lr, $r0, $r1, $r2, $r3 + + renamable $vpr = MVE_VCTP16 renamable $r3, 0, $noreg + MVE_VPST 4, implicit $vpr + renamable $r1, renamable $q0 = MVE_VLDRHU16_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4) + renamable $r2, renamable $q1 = MVE_VLDRHU16_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4) + renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 7, 14, $noreg + renamable $q0 = nsw MVE_VADDi16 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 + MVE_VPST 8, implicit $vpr + renamable $r0 = MVE_VSTRHU16_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4) + renamable $lr = t2LoopDec killed renamable $lr, 1 + t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.2, 14, $noreg + + bb.2.for.cond.cleanup: + tPOP_RET 14, $noreg, def $r7, def $pc + +... 
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-32.mir =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-32.mir +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-32.mir @@ -1,3 +1,4 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve,+lob -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s # Local use after def, this mov is using r3: @@ -6,15 +7,8 @@ # # We should optimise away the SUB -# CHECK: bb.1.vector.body: -# CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg -# CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 - --- | - target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" - target triple = "thumbv8.1m.main-arm-unknown-eabi" - - define dso_local void @local_use_after_def(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { + define dso_local void @incorrect_sub_32(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { entry: %cmp8 = icmp sgt i32 %N, 0 %0 = add i32 %N, 3 @@ -38,53 +32,31 @@ %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>* %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>* - %8 = call <4 x i1> @llvm.arm.vctp32(i32 %7) - %9 = sub i32 %7, 4 - %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %8, <4 x i32> undef), !tbaa !3 - %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %8, <4 x i32> undef), !tbaa !3 + %8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7) + %9 = sub i32 %7, 5 + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %8, 
<4 x i32> undef) + %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %8, <4 x i32> undef) %10 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load - call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %10, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %8), !tbaa !3 + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %10, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %8) %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4 %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4 %11 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1) %12 = icmp ne i32 %11, 0 - br i1 %12, label %vector.body, label %for.cond.cleanup, !llvm.loop !7 + br i1 %12, label %vector.body, label %for.cond.cleanup for.cond.cleanup: ; preds = %vector.body, %entry ret void } - declare void @llvm.set.loop.iterations.i32(i32) #1 - declare <4 x i1> @llvm.arm.vctp32(i32) #2 - declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1 - declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #3 - declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #4 - declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #3 - declare void @llvm.stackprotector(i8*, i8**) #5 - - attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+fp-armv8d16sp,+fp16,+fpregs,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" "unsafe-fp-math"="true" "use-soft-float"="false" } - attributes #1 = { noduplicate nounwind } - 
attributes #2 = { nounwind readnone } - attributes #3 = { argmemonly nounwind willreturn } - attributes #4 = { argmemonly nounwind readonly willreturn } - attributes #5 = { nounwind } - - !llvm.module.flags = !{!0, !1} - !llvm.ident = !{!2} - - !0 = !{i32 1, !"wchar_size", i32 4} - !1 = !{i32 1, !"min_enum_size", i32 4} - !2 = !{!"clang version 10.0.0 (http://github.com/llvm/llvm-project 2589b6d9edda73280fe1dc1d944ee34e22fe9a6f)"} - !3 = !{!4, !4, i64 0} - !4 = !{!"int", !5, i64 0} - !5 = !{!"omnipotent char", !6, i64 0} - !6 = !{!"Simple C++ TBAA"} - !7 = distinct !{!7, !8} - !8 = !{!"llvm.loop.isvectorized", i32 1} + declare void @llvm.set.loop.iterations.i32(i32) + declare <4 x i1> @llvm.arm.mve.vctp32(i32) + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) + declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) + declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) ... --- -name: local_use_after_def +name: incorrect_sub_32 alignment: 2 exposesReturnsTwice: false legalized: false @@ -130,16 +102,45 @@ constants: [] machineFunctionInfo: {} body: | + ; CHECK-LABEL: name: incorrect_sub_32 + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $r3, $r1, $r0, $r2, $r7, $lr + ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr + ; CHECK: t2IT 11, 8, implicit-def $itstate + ; CHECK: tPOP_RET 11, killed $cpsr, def dead $r7, def $pc, implicit killed $itstate + ; CHECK: renamable $r12 = t2ADDri renamable $r3, 3, 14, $noreg, $noreg + ; CHECK: renamable $lr = t2MOVi 1, 14, $noreg, $noreg + ; CHECK: renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg + ; CHECK: renamable 
$r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg + ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg + ; CHECK: $lr = t2DLS killed renamable $lr + ; CHECK: bb.1.vector.body: + ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: liveins: $lr, $r1, $r0, $r2, $r3 + ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg + ; CHECK: MVE_VPST 4, implicit $vpr + ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4) + ; CHECK: renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4) + ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 5, 14, $noreg + ; CHECK: renamable $q0 = nsw MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 + ; CHECK: MVE_VPST 8, implicit $vpr + ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4) + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1 + ; CHECK: bb.2.for.cond.cleanup: + ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc bb.0.entry: successors: %bb.1(0x80000000) - liveins: $r0, $r1, $r2, $r3, $lr + liveins: $r0, $r1, $r2, $r3, $r7, $lr - frame-setup tPUSH 14, $noreg, $r7, killed $lr, implicit-def $sp, implicit $sp + frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp frame-setup CFI_INSTRUCTION def_cfa_offset 8 frame-setup CFI_INSTRUCTION offset $lr, -4 frame-setup CFI_INSTRUCTION offset $r7, -8 - $r7 = frame-setup tMOVr $sp, 14, $noreg - frame-setup CFI_INSTRUCTION def_cfa_register $r7 tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr t2IT 11, 8, implicit-def $itstate tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate @@ -156,14 +157,13 @@ renamable $vpr = MVE_VCTP32 
renamable $r3, 0, $noreg MVE_VPST 4, implicit $vpr - renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4, !tbaa !3) - renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4, !tbaa !3) - renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg + renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4) + renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4) + renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 5, 14, $noreg renamable $q0 = nsw MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 MVE_VPST 8, implicit $vpr - renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4, !tbaa !3) + renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4) renamable $lr = t2LoopDec killed renamable $lr, 1 - $r2 = tMOVr killed $r3, 14, $noreg t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr tB %bb.2, 14, $noreg Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-8.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-8.mir @@ -0,0 +1,166 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s +--- | + define dso_local void @incorrect_sub_8(i8* noalias nocapture %A, i8* noalias nocapture readonly %B, i8* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { + entry: 
+ %cmp8 = icmp sgt i32 %N, 0 + %0 = add i32 %N, 3 + %1 = lshr i32 %0, 2 + %2 = shl nuw i32 %1, 2 + %3 = add i32 %2, -4 + %4 = lshr i32 %3, 2 + %5 = add nuw nsw i32 %4, 1 + br i1 %cmp8, label %vector.ph, label %for.cond.cleanup + + vector.ph: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %5) + br label %vector.body + + vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv17 = phi i8* [ %scevgep18, %vector.body ], [ %A, %vector.ph ] + %lsr.iv14 = phi i8* [ %scevgep15, %vector.body ], [ %C, %vector.ph ] + %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %B, %vector.ph ] + %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ] + %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ] + %lsr.iv13 = bitcast i8* %lsr.iv to <16 x i8>* + %lsr.iv1416 = bitcast i8* %lsr.iv14 to <16 x i8>* + %lsr.iv1719 = bitcast i8* %lsr.iv17 to <16 x i8>* + %8 = call <16 x i1> @llvm.arm.mve.vctp8(i32 %7) + %9 = sub i32 %7, 15 + %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %lsr.iv13, i32 4, <16 x i1> %8, <16 x i8> undef) + %wide.masked.load12 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %lsr.iv1416, i32 4, <16 x i1> %8, <16 x i8> undef) + %10 = add nsw <16 x i8> %wide.masked.load12, %wide.masked.load + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %10, <16 x i8>* %lsr.iv1719, i32 4, <16 x i1> %8) + %scevgep = getelementptr i8, i8* %lsr.iv, i32 16 + %scevgep15 = getelementptr i8, i8* %lsr.iv14, i32 16 + %scevgep18 = getelementptr i8, i8* %lsr.iv17, i32 16 + %11 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1) + %12 = icmp ne i32 %11, 0 + br i1 %12, label %vector.body, label %for.cond.cleanup + + for.cond.cleanup: ; preds = %vector.body, %entry + ret void + } + declare void @llvm.set.loop.iterations.i32(i32) + declare <16 x i1> @llvm.arm.mve.vctp8(i32) + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) + declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) 
+ declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>) + declare void @llvm.stackprotector(i8*, i8**) +... +--- +name: incorrect_sub_8 +alignment: 2 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } + - { reg: '$r3', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: incorrect_sub_8 + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $r3, $r1, $r0, $r2, $r7, $lr + ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr + ; CHECK: t2IT 11, 8, 
implicit-def $itstate + ; CHECK: tPOP_RET 11, killed $cpsr, def dead $r7, def $pc, implicit killed $itstate + ; CHECK: renamable $r12 = t2ADDri renamable $r3, 3, 14, $noreg, $noreg + ; CHECK: renamable $lr = t2MOVi 1, 14, $noreg, $noreg + ; CHECK: renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg + ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg + ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg + ; CHECK: $lr = t2DLS killed renamable $lr + ; CHECK: bb.1.vector.body: + ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: liveins: $lr, $r1, $r0, $r2, $r3 + ; CHECK: renamable $vpr = MVE_VCTP8 renamable $r3, 0, $noreg + ; CHECK: MVE_VPST 4, implicit $vpr + ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRBU8_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4) + ; CHECK: renamable $r2, renamable $q1 = MVE_VLDRBU8_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4) + ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 15, 14, $noreg + ; CHECK: renamable $q0 = nsw MVE_VADDi8 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 + ; CHECK: MVE_VPST 8, implicit $vpr + ; CHECK: renamable $r0 = MVE_VSTRBU8_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4) + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1 + ; CHECK: bb.2.for.cond.cleanup: + ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc + bb.0.entry: + successors: %bb.1(0x80000000) + liveins: $r0, $r1, $r2, $r3, $r7, $lr + + frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr + t2IT 11, 8, 
implicit-def $itstate + tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate + renamable $r12 = t2ADDri renamable $r3, 3, 14, $noreg, $noreg + renamable $lr = t2MOVi 1, 14, $noreg, $noreg + renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg + renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg + renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg + t2DoLoopStart renamable $lr + + bb.1.vector.body: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $lr, $r0, $r1, $r2, $r3 + + renamable $vpr = MVE_VCTP8 renamable $r3, 0, $noreg + MVE_VPST 4, implicit $vpr + renamable $r1, renamable $q0 = MVE_VLDRBU8_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4) + renamable $r2, renamable $q1 = MVE_VLDRBU8_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4) + renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 15, 14, $noreg + renamable $q0 = nsw MVE_VADDi8 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 + MVE_VPST 8, implicit $vpr + renamable $r0 = MVE_VSTRBU8_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4) + renamable $lr = t2LoopDec killed renamable $lr, 1 + t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.2, 14, $noreg + + bb.2.for.cond.cleanup: + tPOP_RET 14, $noreg, def $r7, def $pc + +... 
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dls.mir =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dls.mir +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dls.mir @@ -1,19 +1,17 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=thumbv8.1m.main -run-pass=arm-low-overhead-loops %s -verify-machineinstrs -o - | FileCheck %s -# CHECK: $lr = t2DLS killed $r0 -# CHECK-NOT: $lr = tMOVr $r0 -# CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1 --- | target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" target triple = "thumbv8.1m.main" - + define i32 @do_copy(i32 %n, i32* nocapture %p, i32* nocapture readonly %q) { entry: %scevgep = getelementptr i32, i32* %q, i32 -1 %scevgep3 = getelementptr i32, i32* %p, i32 -1 call void @llvm.set.loop.iterations.i32(i32 %n) br label %while.body - + while.body: ; preds = %while.body, %entry %lsr.iv4 = phi i32* [ %scevgep5, %while.body ], [ %scevgep3, %entry ] %lsr.iv = phi i32* [ %scevgep1, %while.body ], [ %scevgep, %entry ] @@ -27,14 +25,14 @@ %2 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %0, i32 1) %3 = icmp ne i32 %2, 0 br i1 %3, label %while.body, label %while.end - + while.end: ; preds = %while.body ret i32 0 } - + declare void @llvm.set.loop.iterations.i32(i32) #0 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #0 - + attributes #0 = { noduplicate nounwind } attributes #1 = { nounwind } @@ -75,20 +73,40 @@ restorePoint: '' fixedStack: [] stack: - - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 1, 
name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } callSites: [] constants: [] machineFunctionInfo: {} body: | + ; CHECK-LABEL: name: do_copy + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $r1, $r2, $r0, $r7, $lr + ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: $lr = t2DLS killed $r0 + ; CHECK: renamable $r0, dead $cpsr = tSUBi3 killed renamable $r1, 4, 14, $noreg + ; CHECK: renamable $r1, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14, $noreg + ; CHECK: bb.1.while.body: + ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: liveins: $lr, $r0, $r1 + ; CHECK: renamable $r2, renamable $r1 = t2LDR_PRE killed renamable $r1, 4, 14, $noreg :: (load 4 from %ir.scevgep6) + ; CHECK: early-clobber renamable $r0 = t2STR_PRE killed renamable $r2, killed renamable $r0, 4, 14, $noreg :: (store 4 into %ir.scevgep2) + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1 + ; CHECK: bb.2.while.end: + ; CHECK: $r0, dead $cpsr = tMOVi8 0, 14, $noreg + ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc, implicit killed $r0 bb.0.entry: successors: %bb.1(0x80000000) liveins: $r0, $r1, $r2, $r7, $lr - + frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp frame-setup CFI_INSTRUCTION def_cfa_offset 8 frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -97,17 +115,17 @@ $lr = tMOVr killed $r0, 14, $noreg renamable $r0, dead $cpsr = tSUBi3 killed renamable $r1, 4, 14, $noreg 
renamable $r1, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14, $noreg - + bb.1.while.body: successors: %bb.1(0x7c000000), %bb.2(0x04000000) liveins: $lr, $r0, $r1 - + renamable $r2, renamable $r1 = t2LDR_PRE killed renamable $r1, 4, 14, $noreg :: (load 4 from %ir.scevgep6) early-clobber renamable $r0 = t2STR_PRE killed renamable $r2, killed renamable $r0, 4, 14, $noreg :: (store 4 into %ir.scevgep2) renamable $lr = t2LoopDec killed renamable $lr, 1 t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr tB %bb.2, 14, $noreg - + bb.2.while.end: $r0, dead $cpsr = tMOVi8 0, 14, $noreg tPOP_RET 14, $noreg, def $r7, def $pc, implicit killed $r0 Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-lr-terminator.mir =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-lr-terminator.mir +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-lr-terminator.mir @@ -111,15 +111,14 @@ ; CHECK: tPOP_RET 0, killed $cpsr, def dead $r4, def $pc, implicit killed $itstate ; CHECK: renamable $r12 = t2LSRri killed renamable $r3, 1, 14, $noreg, $noreg ; CHECK: renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg - ; CHECK: $lr = MVE_DLSTP_32 renamable $r12 + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r12 ; CHECK: bb.1.vector.body: ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) - ; CHECK: liveins: $lr, $r0, $r12, $r2, $r3, $r1 + ; CHECK: liveins: $lr, $r0, $r2, $r3, $r1 ; CHECK: renamable $r4, dead $cpsr = tADDrr renamable $r1, renamable $r3, 14, $noreg ; CHECK: renamable $q0 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep45, align 1) ; CHECK: renamable $r4, dead $cpsr = tADDrr renamable $r2, renamable $r3, 14, $noreg ; CHECK: renamable $r3, dead $cpsr = tADDi8 killed renamable $r3, 4, 14, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg ; CHECK: renamable $q1 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep23, align 1) ; 
CHECK: renamable $q0 = nuw nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 0, killed $noreg :: (store 16 into %ir.lsr.iv1, align 4) Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir @@ -118,16 +118,15 @@ ; CHECK: tPOP_RET 0, killed $cpsr, def dead $r4, def $pc, implicit killed $itstate ; CHECK: $r12 = t2MOVr killed $r3, 14, $noreg, $noreg ; CHECK: renamable $r12 = t2LSRri killed renamable $r12, 1, 14, $noreg, $noreg - ; CHECK: $lr = MVE_DLSTP_32 renamable $r12 + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r12 ; CHECK: renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg ; CHECK: bb.1.vector.body: ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) - ; CHECK: liveins: $lr, $r0, $r12, $r2, $r3, $r1 + ; CHECK: liveins: $lr, $r0, $r2, $r3, $r1 ; CHECK: renamable $r4, dead $cpsr = tADDrr renamable $r1, renamable $r3, 14, $noreg ; CHECK: renamable $q0 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep45, align 1) ; CHECK: renamable $r4, dead $cpsr = tADDrr renamable $r2, renamable $r3, 14, $noreg ; CHECK: renamable $r3, dead $cpsr = tADDi8 killed renamable $r3, 4, 14, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg ; CHECK: renamable $q1 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep23, align 1) ; CHECK: renamable $q0 = nuw nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 0, killed $noreg :: (store 16 into %ir.lsr.iv1, align 4) Index: 
llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir @@ -117,15 +117,14 @@ ; CHECK: $r12 = t2MOVr killed $r3, 14, $noreg, $noreg ; CHECK: renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg ; CHECK: renamable $r12 = t2LSRri killed renamable $r12, 1, 14, $noreg, $noreg - ; CHECK: $lr = MVE_DLSTP_32 renamable $r12 + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r12 ; CHECK: bb.1.vector.body: ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) - ; CHECK: liveins: $lr, $r0, $r12, $r2, $r3, $r1 + ; CHECK: liveins: $lr, $r0, $r2, $r3, $r1 ; CHECK: renamable $r4, dead $cpsr = tADDrr renamable $r1, renamable $r3, 14, $noreg ; CHECK: renamable $q0 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep45, align 1) ; CHECK: renamable $r4, dead $cpsr = tADDrr renamable $r2, renamable $r3, 14, $noreg ; CHECK: renamable $r3, dead $cpsr = tADDi8 killed renamable $r3, 4, 14, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg ; CHECK: renamable $q1 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep23, align 1) ; CHECK: renamable $q0 = nuw nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 0, killed $noreg :: (store 16 into %ir.lsr.iv1, align 4) Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll @@ -388,7 +388,6 @@ ; CHECK-NEXT: vldrb.u32 q1, [r5] ; CHECK-NEXT: vmul.i32 q0, q1, q0 ; CHECK-NEXT: adds r4, #4 -; CHECK-NEXT: 
sub.w r12, r12, #4 ; CHECK-NEXT: vadd.i32 q0, q0, r2 ; CHECK-NEXT: vstrw.32 q0, [r3], #16 ; CHECK-NEXT: letp lr, .LBB5_5 @@ -593,7 +592,6 @@ ; CHECK-NEXT: vldrh.s32 q0, [r0], #8 ; CHECK-NEXT: vldrh.s32 q1, [r1], #8 ; CHECK-NEXT: vmul.i32 q0, q1, q0 -; CHECK-NEXT: sub.w r12, r12, #4 ; CHECK-NEXT: vadd.i32 q0, q0, r2 ; CHECK-NEXT: vstrw.32 q0, [r3], #16 ; CHECK-NEXT: letp lr, .LBB6_1 @@ -685,7 +683,6 @@ ; CHECK-NEXT: vldrb.u32 q1, [r5] ; CHECK-NEXT: vmul.i32 q0, q1, q0 ; CHECK-NEXT: adds r4, #4 -; CHECK-NEXT: sub.w r12, r12, #4 ; CHECK-NEXT: vadd.i32 q0, q0, r2 ; CHECK-NEXT: vstrw.32 q0, [r3], #16 ; CHECK-NEXT: letp lr, .LBB7_5 @@ -890,7 +887,6 @@ ; CHECK-NEXT: vldrh.u32 q0, [r0], #8 ; CHECK-NEXT: vldrh.u32 q1, [r1], #8 ; CHECK-NEXT: vmul.i32 q0, q1, q0 -; CHECK-NEXT: sub.w r12, r12, #4 ; CHECK-NEXT: vadd.i32 q0, q0, r2 ; CHECK-NEXT: vstrw.32 q0, [r3], #16 ; CHECK-NEXT: letp lr, .LBB8_1 @@ -980,7 +976,6 @@ ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: vmul.i32 q0, q1, q0 -; CHECK-NEXT: sub.w r12, r12, #4 ; CHECK-NEXT: vadd.i32 q0, q0, r2 ; CHECK-NEXT: vstrw.32 q0, [r3], #16 ; CHECK-NEXT: letp lr, .LBB9_5 Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/remove-elem-moves.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/remove-elem-moves.mir @@ -0,0 +1,323 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s + +--- | + define dso_local arm_aapcs_vfpcc void @remove_mov_lr_chain(float* nocapture readonly %pSrc, float* nocapture %pDst, i32 %blockSize) #0 { + entry: + %cmp5 = icmp eq i32 %blockSize, 0 + br i1 %cmp5, label %while.end, label %while.body.preheader + + while.body.preheader: ; preds = %entry + %min.iters.check = icmp ult i32 %blockSize, 4 + br i1 %min.iters.check, label 
%while.body.preheader19, label %vector.memcheck + + vector.memcheck: ; preds = %while.body.preheader + %scevgep = getelementptr float, float* %pDst, i32 %blockSize + %scevgep12 = getelementptr float, float* %pSrc, i32 %blockSize + %bound0 = icmp ugt float* %scevgep12, %pDst + %bound1 = icmp ugt float* %scevgep, %pSrc + %found.conflict = and i1 %bound0, %bound1 + %0 = lshr i32 %blockSize, 2 + %1 = shl nuw i32 %0, 2 + %2 = add i32 %1, -4 + %3 = lshr i32 %2, 2 + %4 = add nuw nsw i32 %3, 1 + br i1 %found.conflict, label %while.body.preheader19, label %vector.ph + + vector.ph: ; preds = %vector.memcheck + %n.vec = and i32 %blockSize, -4 + %ind.end = sub i32 %blockSize, %n.vec + %ind.end15 = getelementptr float, float* %pSrc, i32 %n.vec + %ind.end17 = getelementptr float, float* %pDst, i32 %n.vec + %scevgep9 = getelementptr float, float* %pDst, i32 -4 + %scevgep14 = getelementptr float, float* %pSrc, i32 -4 + call void @llvm.set.loop.iterations.i32(i32 %4) + br label %vector.body + + vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv15 = phi float* [ %scevgep16, %vector.body ], [ %scevgep14, %vector.ph ] + %lsr.iv10 = phi float* [ %scevgep11, %vector.body ], [ %scevgep9, %vector.ph ] + %5 = phi i32 [ %4, %vector.ph ], [ %7, %vector.body ] + %lsr.iv1517 = bitcast float* %lsr.iv15 to <4 x float>* + %lsr.iv1012 = bitcast float* %lsr.iv10 to <4 x float>* + %scevgep18 = getelementptr <4 x float>, <4 x float>* %lsr.iv1517, i32 1 + %wide.load = load <4 x float>, <4 x float>* %scevgep18, align 4 + %6 = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %wide.load) + %scevgep13 = getelementptr <4 x float>, <4 x float>* %lsr.iv1012, i32 1 + store <4 x float> %6, <4 x float>* %scevgep13, align 4 + %scevgep11 = getelementptr float, float* %lsr.iv10, i32 4 + %scevgep16 = getelementptr float, float* %lsr.iv15, i32 4 + %7 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %5, i32 1) + %8 = icmp ne i32 %7, 0 + br i1 %8, label %vector.body, label %middle.block + + middle.block: ; 
preds = %vector.body + %cmp.n = icmp eq i32 %n.vec, %blockSize + br i1 %cmp.n, label %while.end, label %while.body.preheader19 + + while.body.preheader19: ; preds = %middle.block, %vector.memcheck, %while.body.preheader + %blkCnt.08.ph = phi i32 [ %blockSize, %vector.memcheck ], [ %blockSize, %while.body.preheader ], [ %ind.end, %middle.block ] + %pSrc.addr.07.ph = phi float* [ %pSrc, %vector.memcheck ], [ %pSrc, %while.body.preheader ], [ %ind.end15, %middle.block ] + %pDst.addr.06.ph = phi float* [ %pDst, %vector.memcheck ], [ %pDst, %while.body.preheader ], [ %ind.end17, %middle.block ] + %scevgep1 = getelementptr float, float* %pSrc.addr.07.ph, i32 -1 + %scevgep4 = getelementptr float, float* %pDst.addr.06.ph, i32 -1 + call void @llvm.set.loop.iterations.i32(i32 %blkCnt.08.ph) + br label %while.body + + while.body: ; preds = %while.body, %while.body.preheader19 + %lsr.iv5 = phi float* [ %scevgep6, %while.body ], [ %scevgep4, %while.body.preheader19 ] + %lsr.iv = phi float* [ %scevgep2, %while.body ], [ %scevgep1, %while.body.preheader19 ] + %9 = phi i32 [ %blkCnt.08.ph, %while.body.preheader19 ], [ %12, %while.body ] + %scevgep3 = getelementptr float, float* %lsr.iv, i32 1 + %scevgep7 = getelementptr float, float* %lsr.iv5, i32 1 + %10 = load float, float* %scevgep3, align 4 + %11 = tail call fast float @llvm.fabs.f32(float %10) + store float %11, float* %scevgep7, align 4 + %scevgep2 = getelementptr float, float* %lsr.iv, i32 1 + %scevgep6 = getelementptr float, float* %lsr.iv5, i32 1 + %12 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %9, i32 1) + %13 = icmp ne i32 %12, 0 + br i1 %13, label %while.body, label %while.end + + while.end: ; preds = %while.body, %middle.block, %entry + ret void + } + declare float @llvm.fabs.f32(float) + declare <4 x float> @llvm.fabs.v4f32(<4 x float>) + declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) + +... 
+--- +name: remove_mov_lr_chain +alignment: 2 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 16 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 2, name: '', type: spill-slot, offset: -12, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r5', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 3, name: '', type: spill-slot, offset: -16, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: remove_mov_lr_chain + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.9(0x30000000), %bb.1(0x50000000) + ; CHECK: liveins: $r0, $r1, $r2, $r4, $r5, $r7, $lr + ; CHECK: frame-setup tPUSH 
14, $noreg, killed $r4, killed $r5, killed $r7, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 16 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r5, -12 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -16 + ; CHECK: tCMPi8 renamable $r2, 0, 14, $noreg, implicit-def $cpsr + ; CHECK: tBcc %bb.9, 0, killed $cpsr + ; CHECK: bb.1.while.body.preheader: + ; CHECK: successors: %bb.6(0x40000000), %bb.2(0x40000000) + ; CHECK: liveins: $r0, $r1, $r2 + ; CHECK: tCMPi8 renamable $r2, 4, 14, $noreg, implicit-def $cpsr + ; CHECK: tBcc %bb.6, 3, killed $cpsr + ; CHECK: bb.2.vector.memcheck: + ; CHECK: successors: %bb.3(0x40000000), %bb.6(0x40000000) + ; CHECK: liveins: $r2, $r0, $r1 + ; CHECK: renamable $r3 = t2ADDrs renamable $r0, renamable $r2, 18, 14, $noreg, $noreg + ; CHECK: tCMPr killed renamable $r3, renamable $r1, 14, $noreg, implicit-def $cpsr + ; CHECK: t2IT 8, 4, implicit-def $itstate + ; CHECK: renamable $r3 = t2ADDrs renamable $r1, renamable $r2, 18, 8, $cpsr, $noreg, implicit $itstate + ; CHECK: tCMPr killed renamable $r3, renamable $r0, 8, killed $cpsr, implicit-def $cpsr, implicit killed $itstate + ; CHECK: tBcc %bb.6, 8, killed $cpsr + ; CHECK: bb.3.vector.ph: + ; CHECK: successors: %bb.4(0x80000000) + ; CHECK: liveins: $r2, $r0, $r1 + ; CHECK: renamable $r4 = t2BICri renamable $r2, 3, 14, $noreg, $noreg + ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg + ; CHECK: renamable $r12 = t2SUBri renamable $r4, 4, 14, $noreg, $noreg + ; CHECK: renamable $r7, dead $cpsr = tSUBrr renamable $r2, renamable $r4, 14, $noreg + ; CHECK: renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14, $noreg, $noreg + ; CHECK: renamable $r12 = t2ADDrs renamable $r0, renamable $r4, 18, 14, $noreg, $noreg + ; CHECK: $lr = t2DLS renamable $r3 + ; CHECK: renamable $r0, dead $cpsr = tSUBi8 killed 
renamable $r0, 16, 14, $noreg + ; CHECK: dead $r5 = tMOVr killed $r3, 14, $noreg + ; CHECK: renamable $r3 = t2ADDrs renamable $r1, renamable $r4, 18, 14, $noreg, $noreg + ; CHECK: renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 16, 14, $noreg + ; CHECK: bb.4.vector.body: + ; CHECK: successors: %bb.4(0x7c000000), %bb.5(0x04000000) + ; CHECK: liveins: $r7, $r12, $r3, $r4, $r2, $lr, $r1, $r0 + ; CHECK: renamable $r0, renamable $q0 = MVE_VLDRWU32_pre killed renamable $r0, 16, 0, $noreg :: (load 16 from %ir.scevgep18, align 4) + ; CHECK: renamable $q0 = nnan ninf nsz arcp contract afn reassoc MVE_VABSf32 killed renamable $q0, 0, $noreg, undef renamable $q0 + ; CHECK: renamable $r1 = MVE_VSTRBU8_pre killed renamable $q0, killed renamable $r1, 16, 0, $noreg :: (store 16 into %ir.scevgep13, align 4) + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.4 + ; CHECK: bb.5.middle.block: + ; CHECK: successors: %bb.7(0x80000000) + ; CHECK: liveins: $r7, $r12, $r3, $r4, $r2 + ; CHECK: tCMPr killed renamable $r4, killed renamable $r2, 14, $noreg, implicit-def $cpsr + ; CHECK: $lr = tMOVr killed $r7, 14, $noreg + ; CHECK: t2IT 0, 8, implicit-def $itstate + ; CHECK: tPOP_RET 0, killed $cpsr, def dead $r4, def dead $r5, def dead $r7, def $pc, implicit killed $itstate + ; CHECK: tB %bb.7, 14, $noreg + ; CHECK: bb.6: + ; CHECK: successors: %bb.7(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2 + ; CHECK: $lr = tMOVr killed $r2, 14, $noreg + ; CHECK: $r12 = tMOVr killed $r0, 14, $noreg + ; CHECK: $r3 = tMOVr killed $r1, 14, $noreg + ; CHECK: bb.7.while.body.preheader19: + ; CHECK: successors: %bb.8(0x80000000) + ; CHECK: liveins: $lr, $r12, $r3 + ; CHECK: renamable $r0, dead $cpsr = tSUBi3 killed renamable $r3, 4, 14, $noreg + ; CHECK: renamable $r1 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg + ; CHECK: $lr = t2DLS killed renamable $lr + ; CHECK: bb.8.while.body: + ; CHECK: successors: %bb.8(0x7c000000), %bb.9(0x04000000) + ; CHECK: liveins: $lr, $r0, $r1 + ; CHECK: 
renamable $s0 = VLDRS renamable $r1, 1, 14, $noreg :: (load 4 from %ir.scevgep3) + ; CHECK: renamable $r1, dead $cpsr = tADDi8 killed renamable $r1, 4, 14, $noreg + ; CHECK: renamable $s0 = nnan ninf nsz arcp contract afn reassoc VABSS killed renamable $s0, 14, $noreg + ; CHECK: VSTRS killed renamable $s0, renamable $r0, 1, 14, $noreg :: (store 4 into %ir.scevgep7) + ; CHECK: renamable $r0, dead $cpsr = tADDi8 killed renamable $r0, 4, 14, $noreg + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.8 + ; CHECK: bb.9.while.end: + ; CHECK: tPOP_RET 14, $noreg, def $r4, def $r5, def $r7, def $pc + bb.0.entry: + successors: %bb.9(0x30000000), %bb.1(0x50000000) + liveins: $r0, $r1, $r2, $r4, $r5, $r7, $lr + + frame-setup tPUSH 14, $noreg, killed $r4, killed $r5, killed $r7, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 16 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + frame-setup CFI_INSTRUCTION offset $r5, -12 + frame-setup CFI_INSTRUCTION offset $r4, -16 + tCMPi8 renamable $r2, 0, 14, $noreg, implicit-def $cpsr + tBcc %bb.9, 0, killed $cpsr + + bb.1.while.body.preheader: + successors: %bb.6(0x40000000), %bb.2(0x40000000) + liveins: $r0, $r1, $r2 + + tCMPi8 renamable $r2, 4, 14, $noreg, implicit-def $cpsr + tBcc %bb.6, 3, killed $cpsr + + bb.2.vector.memcheck: + successors: %bb.3(0x40000000), %bb.6(0x40000000) + liveins: $r0, $r1, $r2 + + renamable $r3 = t2ADDrs renamable $r0, renamable $r2, 18, 14, $noreg, $noreg + tCMPr killed renamable $r3, renamable $r1, 14, $noreg, implicit-def $cpsr + t2IT 8, 4, implicit-def $itstate + renamable $r3 = t2ADDrs renamable $r1, renamable $r2, 18, 8, $cpsr, $noreg, implicit $itstate + tCMPr killed renamable $r3, renamable $r0, 8, killed $cpsr, implicit-def $cpsr, implicit killed $itstate + tBcc %bb.6, 8, killed $cpsr + + bb.3.vector.ph: + successors: %bb.4(0x80000000) + liveins: $r0, $r1, $r2 + + renamable $r4 = t2BICri renamable $r2, 3, 14, $noreg, 
$noreg + renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg + renamable $r12 = t2SUBri renamable $r4, 4, 14, $noreg, $noreg + renamable $r7, dead $cpsr = tSUBrr renamable $r2, renamable $r4, 14, $noreg + renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14, $noreg, $noreg + renamable $r12 = t2ADDrs renamable $r0, renamable $r4, 18, 14, $noreg, $noreg + t2DoLoopStart renamable $r3 + renamable $r0, dead $cpsr = tSUBi8 killed renamable $r0, 16, 14, $noreg + $r5 = tMOVr killed $r3, 14, $noreg + renamable $r3 = t2ADDrs renamable $r1, renamable $r4, 18, 14, $noreg, $noreg + renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 16, 14, $noreg + + bb.4.vector.body: + successors: %bb.4(0x7c000000), %bb.5(0x04000000) + liveins: $r0, $r1, $r2, $r3, $r4, $r5, $r7, $r12 + + renamable $r0, renamable $q0 = MVE_VLDRWU32_pre killed renamable $r0, 16, 0, $noreg :: (load 16 from %ir.scevgep18, align 4) + $lr = tMOVr killed $r5, 14, $noreg + renamable $q0 = nnan ninf nsz arcp contract afn reassoc MVE_VABSf32 killed renamable $q0, 0, $noreg, undef renamable $q0 + renamable $r1 = MVE_VSTRBU8_pre killed renamable $q0, killed renamable $r1, 16, 0, $noreg :: (store 16 into %ir.scevgep13, align 4) + renamable $lr = t2LoopDec killed renamable $lr, 1 + $r5 = tMOVr $lr, 14, $noreg + t2LoopEnd killed renamable $lr, %bb.4, implicit-def dead $cpsr + tB %bb.5, 14, $noreg + + bb.5.middle.block: + successors: %bb.7(0x80000000) + liveins: $r2, $r3, $r4, $r7, $r12 + + tCMPr killed renamable $r4, killed renamable $r2, 14, $noreg, implicit-def $cpsr + $lr = tMOVr killed $r7, 14, $noreg + t2IT 0, 8, implicit-def $itstate + tPOP_RET 0, killed $cpsr, def $r4, def $r5, def $r7, def $pc, implicit killed $itstate + tB %bb.7, 14, $noreg + + bb.6: + successors: %bb.7(0x80000000) + liveins: $r0, $r1, $r2 + + $lr = tMOVr killed $r2, 14, $noreg + $r12 = tMOVr killed $r0, 14, $noreg + $r3 = tMOVr killed $r1, 14, $noreg + + bb.7.while.body.preheader19: + successors: %bb.8(0x80000000) 
+ liveins: $lr, $r3, $r12 + + renamable $r0, dead $cpsr = tSUBi3 killed renamable $r3, 4, 14, $noreg + renamable $r1 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg + t2DoLoopStart renamable $lr + + bb.8.while.body: + successors: %bb.8(0x7c000000), %bb.9(0x04000000) + liveins: $lr, $r0, $r1 + + renamable $s0 = VLDRS renamable $r1, 1, 14, $noreg :: (load 4 from %ir.scevgep3) + renamable $r1, dead $cpsr = tADDi8 killed renamable $r1, 4, 14, $noreg + renamable $s0 = nnan ninf nsz arcp contract afn reassoc VABSS killed renamable $s0, 14, $noreg + VSTRS killed renamable $s0, renamable $r0, 1, 14, $noreg :: (store 4 into %ir.scevgep7) + renamable $r0, dead $cpsr = tADDi8 killed renamable $r0, 4, 14, $noreg + renamable $lr = t2LoopDec killed renamable $lr, 1 + t2LoopEnd renamable $lr, %bb.8, implicit-def dead $cpsr + tB %bb.9, 14, $noreg + + bb.9.while.end: + tPOP_RET 14, $noreg, def $r4, def $r5, def $r7, def $pc + +... Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/revert-while.mir =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/revert-while.mir +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/revert-while.mir @@ -1,30 +1,20 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=thumbv8.1m.main -mattr=+lob -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s -# CHECK: body: -# CHECK: bb.0.entry: -# CHECK: t2CMPri $r3, 0, 14 -# CHECK-NEXT: t2Bcc %bb.3, 0, killed $cpsr -# CHECK-NEXT: tB %bb.1 -# CHECK: bb.1.do.body.preheader: -# CHECK: $lr = tMOVr killed $r3 -# CHECK: bb.2.do.body: -# CHECK: $lr = t2SUBri killed renamable $lr, 1, 14, $noreg, def $cpsr -# CHECK-NEXT: t2Bcc %bb.2, 1, killed $cpsr -# CHECK-NEXT: tB %bb.3, 14 --- | target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" target triple = "thumbv8.1m.main" - + define void @ne_trip_count(i1 zeroext %t1, i32* nocapture %a, i32* nocapture readonly %b, i32 
%N) #0 { entry: %0 = call i1 @llvm.test.set.loop.iterations.i32(i32 %N) br i1 %0, label %do.body.preheader, label %if.end - + do.body.preheader: ; preds = %entry %scevgep2 = getelementptr i32, i32* %a, i32 -1 %scevgep5 = getelementptr i32, i32* %b, i32 -1 br label %do.body - + do.body: ; preds = %do.body, %do.body.preheader %lsr.iv6 = phi i32* [ %scevgep5, %do.body.preheader ], [ %scevgep7, %do.body ] %lsr.iv = phi i32* [ %scevgep2, %do.body.preheader ], [ %scevgep3, %do.body ] @@ -39,20 +29,20 @@ %scevgep3 = getelementptr i32, i32* %lsr.iv, i32 1 %scevgep7 = getelementptr i32, i32* %lsr.iv6, i32 1 br i1 %3, label %do.body, label %if.end - + if.end: ; preds = %do.body, %entry ret void } - + ; Function Attrs: nounwind declare i32 @llvm.arm.space(i32 immarg, i32) #1 - + ; Function Attrs: noduplicate nounwind declare i1 @llvm.test.set.loop.iterations.i32(i32) #2 - + ; Function Attrs: noduplicate nounwind declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #2 - + attributes #0 = { "target-features"="+lob" } attributes #1 = { nounwind } attributes #2 = { noduplicate nounwind } @@ -94,46 +84,74 @@ restorePoint: '' fixedStack: [] stack: - - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } callSites: [] constants: [] machineFunctionInfo: {} body: | + 
; CHECK-LABEL: name: ne_trip_count + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x40000000), %bb.3(0x40000000) + ; CHECK: liveins: $r1, $r3, $r2, $r7, $lr + ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: t2CMPri $r3, 0, 14, $noreg, implicit-def $cpsr + ; CHECK: t2Bcc %bb.3, 0, killed $cpsr + ; CHECK: tB %bb.1, 14, $noreg + ; CHECK: bb.1.do.body.preheader: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r1, $r3, $r2 + ; CHECK: renamable $r0, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14, $noreg + ; CHECK: renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14, $noreg + ; CHECK: $lr = tMOVr killed $r3, 14, $noreg + ; CHECK: bb.2.do.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) + ; CHECK: liveins: $lr, $r1, $r0 + ; CHECK: dead renamable $r2 = SPACE 4096, undef renamable $r0 + ; CHECK: renamable $r2, renamable $r0 = t2LDR_PRE killed renamable $r0, 4, 14, $noreg :: (load 4 from %ir.scevgep) + ; CHECK: early-clobber renamable $r1 = t2STR_PRE killed renamable $r2, killed renamable $r1, 4, 14, $noreg :: (store 4 into %ir.scevgep1) + ; CHECK: $lr = t2SUBri killed renamable $lr, 1, 14, $noreg, def $cpsr + ; CHECK: t2Bcc %bb.2, 1, killed $cpsr + ; CHECK: tB %bb.3, 14, $noreg + ; CHECK: bb.3.if.end: + ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc bb.0.entry: successors: %bb.1(0x40000000), %bb.3(0x40000000) liveins: $r1, $r2, $r3, $r7, $lr - + frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp frame-setup CFI_INSTRUCTION def_cfa_offset 8 frame-setup CFI_INSTRUCTION offset $lr, -4 frame-setup CFI_INSTRUCTION offset $r7, -8 t2WhileLoopStart $r3, %bb.3, implicit-def dead $cpsr tB %bb.1, 14, $noreg - + bb.1.do.body.preheader: successors: %bb.2(0x80000000) liveins: $r1, $r2, $r3 - 
+ renamable $r0, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14, $noreg renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14, $noreg $lr = tMOVr killed $r3, 14, $noreg - + bb.2.do.body: successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $lr, $r0, $r1 - + dead renamable $r2 = SPACE 4096, undef renamable $r0 renamable $r2, renamable $r0 = t2LDR_PRE killed renamable $r0, 4, 14, $noreg :: (load 4 from %ir.scevgep) early-clobber renamable $r1 = t2STR_PRE killed renamable $r2, killed renamable $r1, 4, 14, $noreg :: (store 4 into %ir.scevgep1) renamable $lr = t2LoopDec killed renamable $lr, 1 t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr tB %bb.3, 14, $noreg - + bb.3.if.end: tPOP_RET 14, $noreg, def $r7, def $pc Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/safe-def-no-mov.mir =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/safe-def-no-mov.mir +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/safe-def-no-mov.mir @@ -1,14 +1,12 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=thumbv8.1m.main -run-pass=arm-low-overhead-loops %s -verify-machineinstrs -o - | FileCheck %s -# CHECK: $lr = t2DLS killed $r0 -# CHECK: $lr = tMOVr $r0, 14 -# CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 # TODO: Explore the preheader to remove the redundant tMOVr --- | target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" target triple = "thumbv8.1m.main" - + define i32 @do_copy(i32 %n, i32* nocapture %p, i32* nocapture readonly %q) { entry: %scevgep = getelementptr i32, i32* %q, i32 -1 @@ -18,7 +16,7 @@ preheader: br label %while.body - + while.body: ; preds = %while.body, %entry %lsr.iv4 = phi i32* [ %scevgep5, %while.body ], [ %scevgep3, %preheader ] %lsr.iv = phi i32* [ %scevgep1, %while.body ], [ %scevgep, %preheader ] @@ -32,14 +30,14 @@ %2 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %0, i32 1) %3 = icmp ne i32 %2, 0 
br i1 %3, label %while.body, label %while.end - + while.end: ; preds = %while.body ret i32 0 } - + declare void @llvm.set.loop.iterations.i32(i32) #0 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #0 - + attributes #0 = { noduplicate nounwind } attributes #1 = { nounwind } @@ -80,20 +78,44 @@ restorePoint: '' fixedStack: [] stack: - - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } callSites: [] constants: [] machineFunctionInfo: {} body: | + ; CHECK-LABEL: name: do_copy + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $r2, $r1, $r0, $r7, $lr + ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: dead $lr = t2DLS killed $r0 + ; CHECK: renamable $r0, dead $cpsr = tSUBi3 killed renamable $r1, 4, 14, $noreg + ; CHECK: renamable $r1, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14, $noreg + ; CHECK: bb.1.preheader: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r1, $r0 + ; CHECK: $lr = tMOVr $r0, 14, $noreg + ; CHECK: bb.2.while.body: + ; CHECK: successors: %bb.2(0x7c000000), 
%bb.3(0x04000000) + ; CHECK: liveins: $lr, $r0, $r1 + ; CHECK: renamable $r2, renamable $r1 = t2LDR_PRE killed renamable $r1, 4, 14, $noreg :: (load 4 from %ir.scevgep6) + ; CHECK: early-clobber renamable $r0 = t2STR_PRE killed renamable $r2, killed renamable $r0, 4, 14, $noreg :: (store 4 into %ir.scevgep2) + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: bb.3.while.end: + ; CHECK: $r0, dead $cpsr = tMOVi8 0, 14, $noreg + ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc, implicit killed $r0 bb.0.entry: successors: %bb.1(0x80000000) liveins: $r0, $r1, $r2, $r7, $lr - + frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp frame-setup CFI_INSTRUCTION def_cfa_offset 8 frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -106,17 +128,17 @@ successors: %bb.2(0x80000000) liveins: $r0 $lr = tMOVr $r0, 14, $noreg - + bb.2.while.body: successors: %bb.2(0x7c000000), %bb.3(0x04000000) liveins: $lr, $r0, $r1 - + renamable $r2, renamable $r1 = t2LDR_PRE killed renamable $r1, 4, 14, $noreg :: (load 4 from %ir.scevgep6) early-clobber renamable $r0 = t2STR_PRE killed renamable $r2, killed renamable $r0, 4, 14, $noreg :: (store 4 into %ir.scevgep2) renamable $lr = t2LoopDec killed renamable $lr, 1 t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr tB %bb.3, 14, $noreg - + bb.3.while.end: $r0, dead $cpsr = tMOVi8 0, 14, $noreg tPOP_RET 14, $noreg, def $r7, def $pc, implicit killed $r0 Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/size-limit.mir =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/size-limit.mir +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/size-limit.mir @@ -1,30 +1,27 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=armv8.1m.main -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s -# CHECK: entry: -# CHECK: $lr = t2DLS -# CHECK: for.body: -# CHECK: $lr = t2LEUpdate killed 
renamable $lr --- | ; ModuleID = 'size-limit.ll' source_filename = "size-limit.ll" target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" target triple = "thumbv8.1m.main" - + define dso_local arm_aapcscc void @size_limit(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) { entry: %cmp8 = icmp eq i32 %N, 0 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader - + for.body.preheader: ; preds = %entry %scevgep = getelementptr i32, i32* %a, i32 -1 %scevgep4 = getelementptr i32, i32* %c, i32 -1 %scevgep8 = getelementptr i32, i32* %b, i32 -1 call void @llvm.set.loop.iterations.i32(i32 %N) br label %for.body - + for.cond.cleanup: ; preds = %for.body, %entry ret void - + for.body: ; preds = %for.body, %for.body.preheader %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ] %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ] @@ -45,19 +42,19 @@ %4 = icmp ne i32 %3, 0 br i1 %4, label %for.body, label %for.cond.cleanup } - + ; Function Attrs: nounwind declare i32 @llvm.arm.space(i32 immarg, i32) #0 - + ; Function Attrs: noduplicate nounwind declare void @llvm.set.loop.iterations.i32(i32) #1 - + ; Function Attrs: noduplicate nounwind declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1 - + ; Function Attrs: nounwind declare void @llvm.stackprotector(i8*, i8**) #0 - + attributes #0 = { nounwind } attributes #1 = { noduplicate nounwind } @@ -99,20 +96,46 @@ restorePoint: '' fixedStack: [] stack: - - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 
4, - stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } callSites: [] constants: [] machineFunctionInfo: {} body: | + ; CHECK-LABEL: name: size_limit + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $r2, $r3, $r0, $r1, $r7, $lr + ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: tCMPi8 $r3, 0, 14, $noreg, implicit-def $cpsr + ; CHECK: t2IT 0, 8, implicit-def $itstate + ; CHECK: tPOP_RET 0, killed $cpsr, def dead $r7, def $pc, implicit killed $itstate + ; CHECK: renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14, $noreg + ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14, $noreg + ; CHECK: renamable $r0, dead $cpsr = tSUBi8 killed renamable $r0, 4, 14, $noreg + ; CHECK: $lr = t2DLS killed $r3 + ; CHECK: bb.1.for.body: + ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: liveins: $lr, $r0, $r2, $r1 + ; CHECK: dead renamable $r3 = SPACE 4070, undef renamable $r0 + ; CHECK: renamable $r12, renamable $r1 = t2LDR_PRE killed renamable $r1, 4, 14, $noreg :: (load 4 from %ir.scevgep3) + ; CHECK: renamable $r3, renamable $r2 = t2LDR_PRE killed renamable $r2, 4, 14, $noreg :: (load 4 from %ir.scevgep7) + ; CHECK: renamable $r3 = nsw t2MUL killed renamable $r3, killed renamable $r12, 14, $noreg + ; CHECK: early-clobber renamable $r0 = t2STR_PRE killed renamable $r3, killed renamable $r0, 4, 14, $noreg :: (store 4 into %ir.scevgep11) + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1 + ; CHECK: bb.2.for.cond.cleanup: 
+ ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc bb.0.entry: successors: %bb.1(0x80000000) liveins: $r0, $r1, $r2, $r3, $r7, $lr - + frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp frame-setup CFI_INSTRUCTION def_cfa_offset 8 frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -125,11 +148,11 @@ renamable $r0, dead $cpsr = tSUBi8 killed renamable $r0, 4, 14, $noreg $lr = tMOVr $r3, 14, $noreg t2DoLoopStart killed $r3 - + bb.1.for.body: successors: %bb.1(0x7c000000), %bb.2(0x04000000) liveins: $lr, $r0, $r1, $r2 - + dead renamable $r3 = SPACE 4070, undef renamable $r0 renamable $r12, renamable $r1 = t2LDR_PRE killed renamable $r1, 4, 14, $noreg :: (load 4 from %ir.scevgep3) renamable $r3, renamable $r2 = t2LDR_PRE killed renamable $r2, 4, 14, $noreg :: (load 4 from %ir.scevgep7) @@ -138,7 +161,7 @@ renamable $lr = t2LoopDec killed renamable $lr, 1 t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr tB %bb.2, 14, $noreg - + bb.2.for.cond.cleanup: tPOP_RET 14, $noreg, def $r7, def $pc Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/while.mir =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/while.mir +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/while.mir @@ -1,6 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=thumbv8.1m.main -mattr=+lob %s -run-pass=arm-low-overhead-loops --verify-machineinstrs -o - | FileCheck %s -# TODO: Remove the lr = tMOVr + +# TODO: Remove the $lr = tMOVr killed $r2 in the preheader --- | target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir @@ -195,22 +195,21 @@ ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: 
frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -8 - ; CHECK: $lr = MVE_WLSTP_8 renamable $r3, %bb.1 + ; CHECK: $lr = MVE_WLSTP_8 killed renamable $r3, %bb.1 ; CHECK: tB %bb.3, 14, $noreg ; CHECK: bb.1.vector.ph: ; CHECK: successors: %bb.2(0x80000000) - ; CHECK: liveins: $lr, $r1, $r2, $r3, $r0 + ; CHECK: liveins: $lr, $r1, $r2, $r0 ; CHECK: renamable $r12 = t2MOVi 0, 14, $noreg, $noreg ; CHECK: bb.2.vector.body: ; CHECK: successors: %bb.3(0x04000000), %bb.2(0x7c000000) - ; CHECK: liveins: $lr, $r12, $r2, $r3, $r0, $r1 + ; CHECK: liveins: $lr, $r12, $r2, $r0, $r1 ; CHECK: renamable $r4 = t2ADDrr renamable $r1, renamable $r12, 14, $noreg, $noreg ; CHECK: renamable $q0 = MVE_VLDRBU8 killed renamable $r4, 0, 0, $noreg :: (load 16 from %ir.scevgep45, align 1) ; CHECK: renamable $r4 = t2ADDrr renamable $r2, renamable $r12, 14, $noreg, $noreg ; CHECK: renamable $q1 = MVE_VLDRBU8 killed renamable $r4, 0, 0, $noreg :: (load 16 from %ir.scevgep23, align 1) ; CHECK: renamable $r4 = t2ADDrr renamable $r0, renamable $r12, 14, $noreg, $noreg ; CHECK: renamable $r12 = t2ADDri killed renamable $r12, 16, 14, $noreg, $noreg - ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 16, 14, $noreg ; CHECK: renamable $q0 = MVE_VMULi8 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 ; CHECK: MVE_VSTRBU8 killed renamable $q0, killed renamable $r4, 0, 0, killed $noreg :: (store 16 into %ir.scevgep1, align 1) ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 @@ -313,16 +312,16 @@ ; CHECK-LABEL: name: test_wlstp16 ; CHECK: bb.0.entry: ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; CHECK: liveins: $r1, $r3, $r0, $r2, $r7, $lr + ; CHECK: liveins: $r1, $r0, $r2, $r3, $r7, $lr ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK: frame-setup 
CFI_INSTRUCTION offset $r7, -8 - ; CHECK: $lr = MVE_WLSTP_16 renamable $r3, %bb.1 + ; CHECK: $lr = MVE_WLSTP_16 killed renamable $r3, %bb.1 ; CHECK: tB %bb.2, 14, $noreg ; CHECK: bb.1.vector.body: ; CHECK: successors: %bb.2(0x04000000), %bb.1(0x7c000000) - ; CHECK: liveins: $lr, $r3, $r0, $r2, $r1 + ; CHECK: liveins: $lr, $r0, $r2, $r1 ; CHECK: renamable $q0 = MVE_VLDRHU16 renamable $r1, 0, 0, $noreg :: (load 16 from %ir.lsr.iv57, align 2) ; CHECK: renamable $q1 = MVE_VLDRHU16 renamable $r2, 0, 0, $noreg :: (load 16 from %ir.lsr.iv24, align 2) ; CHECK: renamable $q0 = MVE_VMULi16 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 @@ -330,7 +329,6 @@ ; CHECK: renamable $r1, dead $cpsr = tADDi8 killed renamable $r1, 16, 14, $noreg ; CHECK: renamable $r2, dead $cpsr = tADDi8 killed renamable $r2, 16, 14, $noreg ; CHECK: renamable $r0, dead $cpsr = tADDi8 killed renamable $r0, 16, 14, $noreg - ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 8, 14, $noreg ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 ; CHECK: bb.2.for.cond.cleanup: ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc