Index: llvm/include/llvm/CodeGen/MachineLoopUtils.h
===================================================================
--- llvm/include/llvm/CodeGen/MachineLoopUtils.h
+++ llvm/include/llvm/CodeGen/MachineLoopUtils.h
@@ -10,6 +10,7 @@
 #define LLVM_LIB_CODEGEN_MACHINELOOPUTILS_H
 
 namespace llvm {
+class MachineLoop;
 class MachineBasicBlock;
 class MachineRegisterInfo;
 class TargetInstrInfo;
@@ -36,6 +37,10 @@
                                        MachineRegisterInfo &MRI,
                                        const TargetInstrInfo *TII);
 
+/// Return true if PhysReg is live-in to at least one exit block of Loop, i.e.
+/// it is live outside the loop, and false otherwise.
+bool isRegLiveInExitBlocks(MachineLoop *Loop, int PhysReg);
+
 } // namespace llvm
 
 #endif // LLVM_LIB_CODEGEN_MACHINELOOPUTILS_H
Index: llvm/include/llvm/CodeGen/ReachingDefAnalysis.h
===================================================================
--- llvm/include/llvm/CodeGen/ReachingDefAnalysis.h
+++ llvm/include/llvm/CodeGen/ReachingDefAnalysis.h
@@ -110,6 +110,10 @@
   /// use or a live out.
   bool isRegUsedAfter(MachineInstr *MI, int PhysReg);
 
+  /// Returns the closest instruction preceding MI in its basic block that
+  /// reads PhysReg, or nullptr if no earlier instruction in the block does.
+  MachineInstr *isRegUsedBefore(MachineInstr *MI, int PhysReg);
+
   /// Provides the clearance - the number of instructions since the closest
   /// reaching def instuction of PhysReg that reaches MI.
   int getClearance(MachineInstr *MI, MCPhysReg PhysReg);
Index: llvm/lib/CodeGen/MachineLoopUtils.cpp
===================================================================
--- llvm/lib/CodeGen/MachineLoopUtils.cpp
+++ llvm/lib/CodeGen/MachineLoopUtils.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/MachineLoopUtils.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -130,3 +131,14 @@
 
   return NewBB;
 }
+
+bool llvm::isRegLiveInExitBlocks(MachineLoop *Loop, int PhysReg) {
+  SmallVector<MachineBasicBlock *, 4> ExitBlocks;
+  Loop->getExitBlocks(ExitBlocks);
+
+  for (auto *MBB : ExitBlocks)
+    if (MBB->isLiveIn(PhysReg))
+      return true;
+
+  return false;
+}
Index: llvm/lib/CodeGen/ReachingDefAnalysis.cpp
===================================================================
--- llvm/lib/CodeGen/ReachingDefAnalysis.cpp
+++ llvm/lib/CodeGen/ReachingDefAnalysis.cpp
@@ -272,3 +272,22 @@
   return false;
 }
 
+MachineInstr *ReachingDefAnalysis::isRegUsedBefore(MachineInstr *MI,
+                                                   int PhysReg) {
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineBasicBlock::iterator It(MI);
+
+  // Walk backwards from MI up to and including the first instruction of the
+  // block, without ever decrementing past begin().
+  while (It != MBB->begin()) {
+    --It;
+    // Debug instructions must not influence code generation.
+    if (It->isDebugInstr())
+      continue;
+    for (const MachineOperand &MO : It->operands())
+      if (MO.isReg() && MO.isUse() && MO.getReg() == PhysReg)
+        return &*It;
+  }
+
+  return nullptr;
+}
Index: llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -24,6 +24,7 @@
 #include "ARMSubtarget.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineLoopUtils.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/ReachingDefAnalysis.h"
@@ -200,6 +201,8 @@
 
     void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const;
 
+    void RemoveLoopUpdate(LowOverheadLoop &LoLoop);
+
     void RemoveVPTBlocks(LowOverheadLoop &LoLoop);
 
     MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop);
@@ -631,6 +634,80 @@
   return &*MIB;
 }
 
+// Goal is to optimise and clean-up these loops:
+//
+//   vector.body:
+//     renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
+//     renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3(tied-def 0), 4
+//     ..
+//     $lr = MVE_DLSTP_32 renamable $r3
+//
+// The SUB is the old update of the loop iteration count expression, which
+// is no longer needed. This sub is removed when the element count, which is in
+// r3 in this example, is defined by an instruction in the loop, and it has
+// no uses other than the VCTP, which is erased right after this clean-up.
+//
+void ARMLowOverheadLoops::RemoveLoopUpdate(LowOverheadLoop &LoLoop) {
+  Register ElemCount = LoLoop.VCTP->getOperand(1).getReg();
+  MachineInstr *LastInstrInBlock = &LoLoop.VCTP->getParent()->back();
+
+  LLVM_DEBUG(dbgs() << "ARM Loops: Trying to remove loop update stmt\n");
+  LLVM_DEBUG(dbgs() << "ARM Loops: Analyzing MO: ";
+             LoLoop.VCTP->getOperand(1).dump());
+
+  // 1. Find the definition we are interested in removing, if there is one.
+  MachineInstr *Def = RDA->getReachingMIDef(LastInstrInBlock, ElemCount);
+  if (!Def)
+    return;
+
+  // 2. Only erase Def if producing the element count is all it does: it must
+  // not access memory or have side effects, and every other register it
+  // defines (e.g. $cpsr) must be dead.
+  bool IsPureUpdate =
+      !Def->mayLoadOrStore() && !Def->hasUnmodeledSideEffects();
+  for (const MachineOperand &MO : Def->operands())
+    if (MO.isReg() && MO.isDef() && MO.getReg() && MO.getReg() != ElemCount &&
+        !MO.isDead())
+      IsPureUpdate = false;
+  if (!IsPureUpdate) {
+    LLVM_DEBUG(dbgs() << "ARM Loops: Def has other effects, can't remove "
+                         "stmt\n");
+    return;
+  }
+
+  // 3. Bail if elemcount is used in exit blocks, i.e. if it is live-in.
+  if (isRegLiveInExitBlocks(LoLoop.ML, ElemCount)) {
+    LLVM_DEBUG(dbgs() << "ARM Loops: Elemcount is live-out, can't remove "
+                         "stmt\n");
+    return;
+  }
+
+  // 4. Bail if there are uses after this Def in the block.
+  SmallVector<MachineInstr *, 4> Uses;
+  RDA->getReachingLocalUses(Def, ElemCount, Uses);
+  if (!Uses.empty()) {
+    LLVM_DEBUG(dbgs() << "ARM Loops: Local uses in block, can't remove "
+                         "stmt\n");
+    return;
+  }
+
+  // 5. And also analyse the uses before this Def in this loop block. The VCTP
+  // reads the element count as well, but it is erased right after this
+  // update is removed, so it is the one use that doesn't keep Def alive.
+  for (MachineInstr *Use = RDA->isRegUsedBefore(Def, ElemCount); Use;
+       Use = RDA->isRegUsedBefore(Use, ElemCount)) {
+    if (Use == LoLoop.VCTP)
+      continue;
+    LLVM_DEBUG(dbgs() << "ARM Loops: Found a use: "; Use->dump());
+    LLVM_DEBUG(dbgs() << "ARM Loops: Can't remove stmt\n");
+    return;
+  }
+
+  LLVM_DEBUG(dbgs() << "ARM Loops: Removing loop update instruction: ";
+             Def->dump());
+  Def->eraseFromParent();
+}
+
 void ARMLowOverheadLoops::RemoveVPTBlocks(LowOverheadLoop &LoLoop) {
   LLVM_DEBUG(dbgs() << "ARM Loops: Removing VCTP: " << *LoLoop.VCTP);
   LoLoop.VCTP->eraseFromParent();
@@ -703,8 +780,10 @@
     RemoveDeadBranch(LoLoop.Start);
     LoLoop.End = ExpandLoopEnd(LoLoop);
     RemoveDeadBranch(LoLoop.End);
-    if (LoLoop.IsTailPredicationLegal())
+    if (LoLoop.IsTailPredicationLegal()) {
+      RemoveLoopUpdate(LoLoop);
       RemoveVPTBlocks(LoLoop);
+    }
   }
 }
 
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
+++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
@@ -41,7 +41,6 @@
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
 ; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
-; CHECK-NEXT:    subs r3, #4
 ; CHECK-NEXT:    vmul.f32 q0, q1, q0
 ; CHECK-NEXT:    vstrw.32 q0, [r0], #16
 ; CHECK-NEXT:    letp lr, .LBB0_5
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
+++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
@@ -388,7 +388,6 @@
 ; CHECK-NEXT:    vldrb.u32 q1, [r5]
 ; CHECK-NEXT:    vmul.i32 q0, q1, q0
 ; CHECK-NEXT:    adds r4, #4
-; CHECK-NEXT:    sub.w r12, r12, #4
 ; CHECK-NEXT:    vadd.i32 q0, q0, r2
 ; CHECK-NEXT:    vstrw.32 q0, [r3], #16
 ; CHECK-NEXT:    letp lr, .LBB5_5
@@ -593,7 +592,6 @@
 ; CHECK-NEXT:    vldrh.s32 q0, [r0], #8
 ; CHECK-NEXT:    vldrh.s32 q1, [r1], #8
 ; CHECK-NEXT:    vmul.i32 q0, q1, q0
-; CHECK-NEXT:    sub.w r12, r12, #4
 ; CHECK-NEXT:    vadd.i32 q0, q0, r2
 ; CHECK-NEXT:    vstrw.32 q0, [r3], #16
 ; CHECK-NEXT:    letp lr, .LBB6_1
@@ -685,7 +683,6 @@
 ; CHECK-NEXT:    vldrb.u32 q1, [r5]
 ; CHECK-NEXT:    vmul.i32 q0, q1, q0
 ; CHECK-NEXT:    adds r4, #4
-; CHECK-NEXT:    sub.w r12, r12, #4
 ; CHECK-NEXT:    vadd.i32 q0, q0, r2
 ; CHECK-NEXT:    vstrw.32 q0, [r3], #16
 ; CHECK-NEXT:    letp lr, .LBB7_5
@@ -890,7 +887,6 @@
 ; CHECK-NEXT:    vldrh.u32 q0, [r0], #8
 ; CHECK-NEXT:    vldrh.u32 q1, [r1], #8
 ; CHECK-NEXT:    vmul.i32 q0, q1, q0
-; CHECK-NEXT:    sub.w r12, r12, #4
 ; CHECK-NEXT:    vadd.i32 q0, q0, r2
 ; CHECK-NEXT:    vstrw.32 q0, [r3], #16
 ; CHECK-NEXT:    letp lr, .LBB8_1
@@ -980,7 +976,6 @@
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
 ; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
 ; CHECK-NEXT:    vmul.i32 q0, q1, q0
-; CHECK-NEXT:    sub.w r12, r12, #4
 ; CHECK-NEXT:    vadd.i32 q0, q0, r2
 ; CHECK-NEXT:    vstrw.32 q0, [r3], #16
 ; CHECK-NEXT:    letp lr, .LBB9_5
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
+++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
@@ -198,7 +198,6 @@
 ; CHECK-NEXT:  .LBB3_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
-; CHECK-NEXT:    subs r3, #4
 ; CHECK-NEXT:    vmul.i32 q0, q0, r2
 ; CHECK-NEXT:    vstrw.32 q0, [r0], #16
 ; CHECK-NEXT:    letp lr, .LBB3_1
@@ -250,7 +249,6 @@
 ; CHECK-NEXT:  .LBB4_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
-; CHECK-NEXT:    subs r3, #4
 ; CHECK-NEXT:    vadd.i32 q0, q0, r2
 ; CHECK-NEXT:    vstrw.32 q0, [r0], #16
 ; CHECK-NEXT:    letp lr, .LBB4_1
@@ -363,7 +361,6 @@
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrh.u16 q0, [r1], #16
 ; CHECK-NEXT:    vldrh.u16 q1, [r2], #16
-; CHECK-NEXT:    subs r3, #8
 ; CHECK-NEXT:    vmul.i16 q0, q1, q0
 ; CHECK-NEXT:    vstrh.16 q0, [r0], #16
 ; CHECK-NEXT:    letp lr, .LBB6_1